0

I'm a Matlab user analyzing electrophysiology data. Recently I found that fseek on a file shared over NFS is slow if the relative offset (the distance in bytes from the current position to the new position) is more than around 10^5 or 10^6.

I checked this in C, and it seems that fseek is slow while lseek is not. Is there a good way to improve fseek speed over NFS? Copying the file to a local drive is not an option because I'm working on real-time data.

When the relative offset is smaller than this threshold, an fseek call takes around 10^-7 seconds. When the relative offset is larger, an fseek call takes about 10^-4 seconds. An lseek call takes on the order of 10^-7 seconds regardless of the offset size. I'm using CentOS 6, and I tested with both Matlab and Eclipse.

Any advice is welcome.

Below is my test code. Since fseek converts SEEK_CUR to SEEK_SET internally, I'm using SEEK_SET when SEEK_CUR would be appropriate.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>

/* Times fseek()+fread() on the file fn; returns 0 on success, -1 on error. */
int test_fseek(char *fn);
/* Times lseek()+read() on the file fn; returns 0 on success, -1 on error. */
int test_seek(char *fn);
/* Creates the random-data test file fn (fsize is given in bytes) unless it is already there. */
void make_testfile(char *fn, long long fsize);

/*
 * Benchmark driver: makes sure the test file exists, then times seeks through
 * stdio (test_fseek) and through raw file descriptors (test_seek).
 *
 * An optional first command-line argument overrides the default NFS path, so
 * the same binary can be pointed at a local file (e.g. /tmp/seek_test_junk)
 * for comparison.
 *
 * Returns EXIT_SUCCESS when both benchmarks complete, EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv)
{
    char *fn = "/net/tera4/raiddata/0/_NAS_NFS_Exports_/shar/seek_test_junk"; // over nfs
    const long long fsize = 1000LL*1000*1000*4;
    int failed = 0;

    if (argc > 1)
    {
        fn = argv[1];
    }

    make_testfile(fn,fsize);

    /* Run both benchmarks even if the first one fails, but report the failure. */
    if (0!=test_fseek(fn))
    {
        failed = 1;
    }
    if (0!=test_seek(fn))
    {
        failed = 1;
    }
    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}

/*
 * Time buffered stdio seeks on the file fn.
 *
 * After an initial 20-byte read at offset 0, each iteration seeks forward by
 * seek_size bytes past the current position (using an absolute SEEK_SET
 * offset), reads 20 bytes, and prints the wall-clock time taken by the fread
 * and by the fseek. seek_size starts at 1e7 and is divided by 1.5 each round
 * until it drops below 10000.
 *
 * Returns 0 on success, -1 if the file cannot be opened, sought or read.
 * The stream is closed on every path after a successful open.
 */
int test_fseek(char *fn)
{
    const long max_seek = 10000000L;
    const long min_seek = 10000L;
    char read_buff[20];
    const size_t read_size = sizeof read_buff;
    long seek_size, curr_pos=0;
    struct timespec ts0,ts1;
    double rlapse, slapse;
    int rc = -1;

    /* Binary mode: arbitrary byte offsets are only well defined for fseek on
     * binary streams. */
    FILE *fd = fopen(fn,"rb");
    if (fd==NULL)
    {
        fprintf(stderr,"File open error.\n");
        return -1;
    }
    if (fread(read_buff,1,read_size,fd)<read_size)
    {
        fprintf(stderr,"Error reading at 0.\n");
        goto out;
    }
    curr_pos += (long)read_size;

    /* Sweep downward from max_seek; to sweep upward instead, start at
     * min_seek and multiply by 1.5 at the end of the loop. */
    seek_size = max_seek;
    while (seek_size<=max_seek && min_seek<=seek_size)
    {
        clock_gettime(CLOCK_REALTIME,&ts0);
        if (0!=fseek(fd,seek_size+curr_pos,SEEK_SET))
        {
            fprintf(stderr,"Error seeking at seek_size:%ld\n",seek_size);
            goto out;
        }
        clock_gettime(CLOCK_REALTIME,&ts1);
        /* Computed in double so the seconds term cannot overflow a long. */
        slapse = (double)(ts1.tv_sec-ts0.tv_sec) + 1.0e-9*(double)(ts1.tv_nsec-ts0.tv_nsec);
        curr_pos += seek_size;

        clock_gettime(CLOCK_REALTIME,&ts0);
        if (fread(read_buff,1,read_size,fd)<read_size)
        {
            fprintf(stderr,"Error reading at seek_size: %ld\n",seek_size);
            goto out;
        }
        clock_gettime(CLOCK_REALTIME,&ts1);
        rlapse = (double)(ts1.tv_sec-ts0.tv_sec) + 1.0e-9*(double)(ts1.tv_nsec-ts0.tv_nsec);
        curr_pos += (long)read_size;

        printf("Skipsize\t%ld\tRead\t%g\tFseek\t%g\n",seek_size,rlapse,slapse);
        seek_size = (long)(seek_size/1.5);
    }
    printf("Finish\n");
    rc = 0;

out:
    fclose(fd);
    return rc;
}

/*
 * Time unbuffered seeks (lseek) on the file fn.
 *
 * After an initial 20-byte read at offset 0, each iteration seeks forward by
 * seek_size bytes past the current position (using an absolute SEEK_SET
 * offset), reads 20 bytes with read(), and prints the wall-clock time taken
 * by the read and by the lseek. seek_size starts at 1e7 and is divided by
 * 1.5 each round until it drops below 10000.
 *
 * Returns 0 on success, -1 if the file cannot be opened, sought or read.
 * The descriptor is closed on every path after a successful open.
 */
int test_seek(char *fn)
{
    const long max_seek = 10000000L;
    const long min_seek = 10000L;
    char read_buff[20];
    const ssize_t read_size = (ssize_t)sizeof read_buff;
    long seek_size, curr_pos=0;
    struct timespec ts0,ts1;
    double rlapse, slapse;
    int rc = -1;

    int fd = open(fn,O_RDONLY);
    /* open() reports failure with -1, so test for any negative value. */
    if (fd<0)
    {
        fprintf(stderr,"File open error.\n");
        return -1;
    }
    if (read(fd,read_buff,sizeof read_buff)<read_size)
    {
        fprintf(stderr,"Error reading at 0.\n");
        goto out;
    }
    curr_pos += (long)read_size;

    /* Sweep downward from max_seek; to sweep upward instead, start at
     * min_seek and multiply by 1.5 at the end of the loop. */
    seek_size = max_seek;
    while (seek_size<=max_seek && min_seek<=seek_size)
    {
        clock_gettime(CLOCK_REALTIME,&ts0);
        if ((off_t)-1==lseek(fd,(off_t)(seek_size+curr_pos),SEEK_SET))
        {
            fprintf(stderr,"Error seeking at seek_size: %ld\n",seek_size);
            goto out;
        }
        clock_gettime(CLOCK_REALTIME,&ts1);
        /* Computed in double so the seconds term cannot overflow a long. */
        slapse = (double)(ts1.tv_sec-ts0.tv_sec) + 1.0e-9*(double)(ts1.tv_nsec-ts0.tv_nsec);
        curr_pos += seek_size;

        clock_gettime(CLOCK_REALTIME,&ts0);
        if (read(fd,read_buff,sizeof read_buff)<read_size)
        {
            fprintf(stderr,"Error reading at seek_size: %ld\n",seek_size);
            goto out;
        }
        clock_gettime(CLOCK_REALTIME,&ts1);
        rlapse = (double)(ts1.tv_sec-ts0.tv_sec) + 1.0e-9*(double)(ts1.tv_nsec-ts0.tv_nsec);
        curr_pos += (long)read_size;

        printf("Skipsize\t%ld\tRead\t%g\tLseek\t%g\n",seek_size,rlapse,slapse);
        seek_size = (long)(seek_size/1.5);
    }
    printf("Finish\n");
    rc = 0;

out:
    close(fd);
    return rc;
}

/*
 * Return 1 if the stream fp holds at least min_size bytes, 0 otherwise.
 * fseek() succeeds even when it moves past end-of-file, so the last required
 * byte is actually read to prove that it exists.
 */
static int testfile_has_at_least(FILE *fp, long long min_size)
{
    long last;

    if (min_size<=0)
    {
        return 1;
    }
    last = (long)(min_size-1);
    if ((long long)last!=min_size-1)
    {
        return 0;   /* offset does not fit in fseek's long argument */
    }
    return 0==fseek(fp,last,SEEK_SET) && EOF!=fgetc(fp);
}

/*
 * Make sure the file fn exists and holds at least fsize bytes.
 *
 * If the file is already large enough it is left untouched. Otherwise it is
 * (re)created and filled with exactly fsize bytes of rand() output, written
 * in blocks of at most 100,000,000 ints, with a progress line printed after
 * each block. Errors are reported on stderr; nothing is returned.
 */
void make_testfile(char *fn, long long fsize)
{
    const long long blksize = 1000LL*1000*100;                  /* ints per block */
    const long long blkbytes = blksize*(long long)sizeof(int);  /* bytes per block */
    long long written = 0;
    long long bufbytes, chunk, fill, j;
    int *rndbuf;
    FILE *fd;

    // test if file already exists
    fd = fopen(fn,"rb");
    if (fd!=NULL)
    {
        int big_enough = testfile_has_at_least(fd,fsize);
        fclose(fd);
        if (big_enough)
        {
            printf("%s already exists.\n",fn);
            return;
        }
        printf("%s is too small.\n",fn);
    }
    else
    {
        printf("%s does not exist.\n",fn);
    }

    fd = fopen(fn,"wb");
    if (fd==NULL)
    {
        fprintf(stderr,"File open error.\n");
        return;
    }

    if (fsize>0)
    {
        /* Never allocate more than one block, nor more than the file needs
         * (rounded up to a whole number of ints). */
        bufbytes = fsize<blkbytes ? fsize : blkbytes;
        fill = (bufbytes+(long long)sizeof(int)-1)/(long long)sizeof(int);
        rndbuf = malloc((size_t)fill*sizeof *rndbuf);
        if (rndbuf==NULL)
        {
            fprintf(stderr,"Memory alloc error.\n");
            fclose(fd);
            return;
        }

        while (written<fsize)
        {
            chunk = fsize-written<blkbytes ? fsize-written : blkbytes;
            fill = (chunk+(long long)sizeof(int)-1)/(long long)sizeof(int);
            for (j = 0;j<fill;j++)
            {
                rndbuf[j]=rand();
            }
            if (fwrite(rndbuf,1,(size_t)chunk,fd)!=(size_t)chunk)
            {
                fprintf(stderr,"File write error.\n");
                break;
            }
            written += chunk;
            printf("%lld bytes written.\n", written);
        }
        free(rndbuf);
    }

    /* fclose flushes buffered data, so a failure here means lost writes. */
    if (0!=fclose(fd))
    {
        fprintf(stderr,"File close error.\n");
    }
}

Additional information: (part of IP addr changed to xx or yy)

Result of nfsstat -m:

/net/p390/common from p390:/common/
 Flags: rw,nosuid,nodev,relatime,vers=4,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=xx.xx.xx.201,minorversion=0,local_lock=none,addr=xx.xx.xx.202

/net/tera1/raid0/data/_NAS_NFS_Exports_ from tera1:/raid0/data/_NAS_NFS_Exports_
 Flags: rw,nosuid,nodev,relatime,vers=3,rsize=262144,wsize=262144,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.245,mountvers=3,mountport=52591,mountproto=udp,local_lock=none,addr=xx.xx.xx.245

/net/tera1/raid0/data/_NAS_NFS_Exports_/share from tera1:/raid0/data/_NAS_NFS_Exports_/share
 Flags: rw,nosuid,nodev,relatime,vers=3,rsize=262144,wsize=262144,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.245,mountvers=3,mountport=52591,mountproto=udp,local_lock=none,addr=xx.xx.xx.245

/net/z80/home from z80:/home/
 Flags: rw,nosuid,nodev,relatime,vers=4,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=xx.xx.xx.201,minorversion=0,local_lock=none,addr=xx.xx.xx.205

/net/tera4/raiddata/0/_NAS_NFS_Exports_ from tera4:/raiddata/0/_NAS_NFS_Exports_
 Flags: rw,nosuid,nodev,relatime,vers=3,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.247,mountvers=3,mountport=49573,mountproto=udp,local_lock=none,addr=xx.xx.xx.247

/net/tera4/raiddata/0/_NAS_NFS_Exports_/share from tera4:/raiddata/0/_NAS_NFS_Exports_/share
 Flags: rw,nosuid,nodev,relatime,vers=3,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.247,mountvers=3,mountport=49573,mountproto=udp,local_lock=none,addr=xx.xx.xx.247

/net/p390/home from p390:/home/
 Flags: rw,nosuid,nodev,relatime,vers=4,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=xx.xx.xx.201,minorversion=0,local_lock=none,addr=xx.xx.xx.202
/net/sinuhe/data from sinuhe:/data
 Flags: rw,nosuid,nodev,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=yy.yy.yy.71,mountvers=3,mountport=892,mountproto=udp,local_lock=none,addr=yy.yy.yy.71

/net/sinuhe/opt from sinuhe:/opt
 Flags: rw,nosuid,nodev,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=yy.yy.yy.71,mountvers=3,mountport=892,mountproto=udp,local_lock=none,addr=yy.yy.yy.71

Result of mount -v:

/dev/mapper/vg_megprec-LogVol01 on / type ext4 (rw)
proc on /proc type proc (rw)
sysfs on /sys type sysfs (rw)
devpts on /dev/pts type devpts (rw,gid=5,mode=620)
tmpfs on /dev/shm type tmpfs (rw,rootcontext="system_u:object_r:tmpfs_t:s0")
/dev/sdf1 on /boot type ext4 (rw)
none on /proc/sys/fs/binfmt_misc type binfmt_misc (rw)
sunrpc on /var/lib/nfs/rpc_pipefs type rpc_pipefs (rw)
p390:/common on /net/p390/common type nfs (rw,nosuid,nodev,intr,sloppy,vers=4,addr=xx.xx.xx.202,clientaddr=xx.xx.xx.201)
tera1:/raid0/data/_NAS_NFS_Exports_ on /net/tera1/raid0/data/_NAS_NFS_Exports_ type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.245)
tera1:/raid0/data/_NAS_NFS_Exports_/share on /net/tera1/raid0/data/_NAS_NFS_Exports_/share type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.245)
z80:/home on /net/z80/home type nfs (rw,nosuid,nodev,intr,sloppy,vers=4,addr=xx.xx.xx.205,clientaddr=xx.xx.xx.201)
tera4:/raiddata/0/_NAS_NFS_Exports_ on /net/tera4/raiddata/0/_NAS_NFS_Exports_ type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.247)
tera4:/raiddata/0/_NAS_NFS_Exports_/share on /net/tera4/raiddata/0/_NAS_NFS_Exports_/share type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.247)
p390:/home on /net/p390/home type nfs (rw,nosuid,nodev,intr,sloppy,vers=4,addr=xx.xx.xx.202,clientaddr=xx.xx.xx.201)
sinuhe:/data on /net/sinuhe/data type nfs (rw,nosuid,nodev,intr,sloppy,addr=yy.yy.yy.71)
sinuhe:/opt on /net/sinuhe/opt type nfs (rw,nosuid,nodev,intr,sloppy,addr=yy.yy.yy.71)
Masao
  • 1
  • 1
  • Which NFS version are you using? What options are you using to your `mount` command? What is the server - Netapp? – Mark Setchell Jul 29 '17 at 07:53
  • File server is Thecus N4810. It supports both NFS 3 and 4, but as I use automount to /net/tera4/raiddata/0/_NAS_NFS_Exports_/, I think I'm using NFS 3. I am not sure what options are given to mount command in automount... – Masao Jul 30 '17 at 09:05
  • I tried to mount using mount command in NFS4 style, like $ sudo mkdir /mnt/tera4 $ sudo mount tera4:/share /mnt/tera4 but the result is same. – Masao Jul 30 '17 at 09:22
  • It may be worth running `nfsstat -m` and `mount -v` to be sure of your setup - then click `edit` under your question and paste the details. I don't have a solution yet, but the details may be relevant, – Mark Setchell Jul 30 '17 at 09:28
  • Thank you, @Mark. I added the output of nfsstat and mount. There seem to be many NFS mounts (both v3 and v4); the one I am checking for now is tera4. I also get a similar result (performance drop) with tera1, p390 and sinuhe. I would appreciate it if you could point out anything that looks strange. – Masao Jul 31 '17 at 11:05
  • *it seems fseek is slow, while lseek is not.* Not surprising. `lseek()` merely sets the current offset on the NFS client process's file descriptor. `fseek()` may do read-ahead to fill up its buffer after the current position is updated. You can run the process under `strace`. Change your test program to only call `fseek()` (so other calls don't pollute the output) and see if the `fseek()` calls are associated with `read()` system calls that actually read data. Something like `strace -o /path/to/output/file /your/test/program`. You can add `-t` or `-tt` to the options to get timing data. – Andrew Henle Aug 03 '17 at 15:44
  • Thank you, @Andrew, for suggesting the read-ahead buffer and strace command. I tested as you wrote and found that after lseek, it actually reads 262144 (==2^18) bytes, not 20 bytes! This is a large overhead for network access... – Masao Aug 05 '17 at 14:42

0 Answers0