I'm a Matlab user analyzing electrophysiology data. Recently I found that fseek on a file shared over NFS is slow if the relative offset (the distance in bytes from the current position to the new position) is more than roughly 10^5 to 10^6.
I checked this in C, and it seems that fseek is slow while lseek is not. Is there a good way to improve fseek speed over NFS? Copying the file to a local drive is not an option because I'm working with real-time data.
When the relative offset is smaller than this threshold, an fseek call takes around 10^-7 seconds; when it is larger, an fseek call takes about 10^-4 seconds. An lseek call takes on the order of 10^-7 seconds regardless of the offset size. I'm using CentOS 6, and I tested with both Matlab and Eclipse.
Any advice is welcome.
Below is my test code. Since fseek converts SEEK_CUR to SEEK_SET internally, I'm using SEEK_SET when SEEK_CUR would be appropriate.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
int test_fseek(char *fn);
int test_seek(char *fn);
void make_testfile(char *fn, long long fsize);
/*
 * Entry point: make sure the test file exists, then time stdio fseek()
 * and raw lseek() over the same file.
 *
 * Returns EXIT_SUCCESS when both timing runs complete, EXIT_FAILURE when
 * either of them reports an error.
 */
int main(void)
{
    // char *fn = "/tmp/seek_test_junk"; // local file
    /* NOTE(review): the NFS mount list for this machine shows
       ".../_NAS_NFS_Exports_/share"; confirm that "shar" is intended. */
    char *fn = "/net/tera4/raiddata/0/_NAS_NFS_Exports_/shar/seek_test_junk"; // over nfs
    long long fsize = 1000LL*1000*1000*4;
    int failed = 0;

    make_testfile(fn, fsize);

    /* Run both benchmarks even if the first one fails, but remember the
       failure so that the exit status reflects it. */
    if (test_fseek(fn) != 0) {
        failed = 1;
    }
    if (test_seek(fn) != 0) {
        failed = 1;
    }

    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}
/*
 * Time fseek() followed by a small fread() on `fn` for a series of
 * shrinking skip distances.
 *
 * After an initial 20-byte read at offset 0, every iteration seeks
 * `seek_size` bytes past the current position (expressed as an absolute
 * SEEK_SET offset), reads 20 bytes, and prints the time spent in each of
 * the two calls.  `seek_size` starts at 10^7 and is divided by 1.5 each
 * round until it drops below 10^4.
 *
 * Returns 0 on success, -1 if the file cannot be opened or if any seek or
 * read fails.  The stream is closed on every path once it has been opened.
 *
 * NOTE(review): stdio streams are buffered; whether fseek() itself refills
 * that buffer, and how large the buffer is on an NFS mount, depends on the
 * C library and was not verified here.  Calling setvbuf() right after
 * fopen() is one way to experiment with that.
 */
int test_fseek(char *fn)
{
    char read_buff[20];
    const size_t read_size = sizeof read_buff;
    long seek_size;
    long curr_pos = 0;
    struct timespec ts0, ts1;
    double rlapse, slapse;
    int rc = -1;
    FILE *fd;

    fd = fopen(fn, "r");
    if (fd == NULL) {
        fprintf(stderr, "File open error.\n");
        return -1;
    }

    if (fread(read_buff, 1, read_size, fd) < read_size) {
        fprintf(stderr, "Error reading at 0.\n");
        goto out;
    }
    curr_pos += (long)read_size;

    /* Downward sweep.  To sweep upward instead, start at 10000 and multiply
       by 1.5; the upper bound in the loop condition exists for that case. */
    seek_size = 10000000L;
    while (seek_size <= 10000000L && seek_size >= 10000L) {
        /* CLOCK_MONOTONIC: interval timing must not be disturbed by
           wall-clock adjustments. */
        clock_gettime(CLOCK_MONOTONIC, &ts0);
        if (fseek(fd, seek_size + curr_pos, SEEK_SET) != 0) {
            fprintf(stderr, "Error seeking at seek_size:%ld\n", seek_size);
            goto out;
        }
        clock_gettime(CLOCK_MONOTONIC, &ts1);
        slapse = (double)(ts1.tv_sec - ts0.tv_sec)
               + 1.0e-9 * (double)(ts1.tv_nsec - ts0.tv_nsec);
        curr_pos += seek_size;

        clock_gettime(CLOCK_MONOTONIC, &ts0);
        if (fread(read_buff, 1, read_size, fd) < read_size) {
            fprintf(stderr, "Error reading at seek_size: %ld\n", seek_size);
            goto out;
        }
        clock_gettime(CLOCK_MONOTONIC, &ts1);
        rlapse = (double)(ts1.tv_sec - ts0.tv_sec)
               + 1.0e-9 * (double)(ts1.tv_nsec - ts0.tv_nsec);
        curr_pos += (long)read_size;

        printf("Skipsize\t%ld\tRead\t%g\tFseek\t%g\n", seek_size, rlapse, slapse);
        seek_size = (long)(seek_size / 1.5);
    }

    printf("Finish\n");
    rc = 0;

out:
    fclose(fd);
    return rc;
}
/*
 * Time lseek() followed by a small read() on `fn` for a series of
 * shrinking skip distances, using a raw file descriptor (no stdio).
 *
 * After an initial 20-byte read at offset 0, every iteration seeks
 * `seek_size` bytes past the current position (expressed as an absolute
 * SEEK_SET offset), reads 20 bytes, and prints the time spent in each of
 * the two calls.  `seek_size` starts at 10^7 and is divided by 1.5 each
 * round until it drops below 10^4.
 *
 * Returns 0 on success, -1 if the file cannot be opened or if any seek or
 * read fails.  The descriptor is closed on every path once it is open.
 */
int test_seek(char *fn)
{
    char read_buff[20];
    const ssize_t read_size = (ssize_t)sizeof read_buff;
    long seek_size;
    long curr_pos = 0;
    struct timespec ts0, ts1;
    double rlapse, slapse;
    int rc = -1;
    int fd;

    fd = open(fn, O_RDONLY);
    /* open() reports failure as -1, so the test must be "< 0"; the previous
       "< -1" could never be true and let a bad descriptor through. */
    if (fd < 0) {
        fprintf(stderr, "File open error.\n");
        return -1;
    }

    /* A short read counts as an error, as does read() returning -1. */
    if (read(fd, read_buff, sizeof read_buff) < read_size) {
        fprintf(stderr, "Error reading at 0.\n");
        goto out;
    }
    curr_pos += (long)read_size;

    /* Downward sweep.  To sweep upward instead, start at 10000 and multiply
       by 1.5; the upper bound in the loop condition exists for that case. */
    seek_size = 10000000L;
    while (seek_size <= 10000000L && seek_size >= 10000L) {
        /* CLOCK_MONOTONIC: interval timing must not be disturbed by
           wall-clock adjustments. */
        clock_gettime(CLOCK_MONOTONIC, &ts0);
        if (lseek(fd, (off_t)(seek_size + curr_pos), SEEK_SET) == (off_t)-1) {
            fprintf(stderr, "Error seeking at seek_size: %ld\n", seek_size);
            goto out;
        }
        clock_gettime(CLOCK_MONOTONIC, &ts1);
        slapse = (double)(ts1.tv_sec - ts0.tv_sec)
               + 1.0e-9 * (double)(ts1.tv_nsec - ts0.tv_nsec);
        curr_pos += seek_size;

        clock_gettime(CLOCK_MONOTONIC, &ts0);
        if (read(fd, read_buff, sizeof read_buff) < read_size) {
            fprintf(stderr, "Error reading at seek_size: %ld\n", seek_size);
            goto out;
        }
        clock_gettime(CLOCK_MONOTONIC, &ts1);
        rlapse = (double)(ts1.tv_sec - ts0.tv_sec)
               + 1.0e-9 * (double)(ts1.tv_nsec - ts0.tv_nsec);
        curr_pos += (long)read_size;

        printf("Skipsize\t%ld\tRead\t%g\tLseek\t%g\n", seek_size, rlapse, slapse);
        seek_size = (long)(seek_size / 1.5);
    }

    printf("Finish\n");
    rc = 0;

out:
    close(fd);
    return rc;
}
/*
 * Make sure `fn` exists and holds at least `fsize` bytes.
 *
 * If the file can be opened and its length is already >= fsize it is left
 * untouched.  Otherwise it is (re)created and filled with exactly `fsize`
 * bytes of rand() output, written in blocks of up to 100,000,000 ints with
 * a progress line after each block.
 *
 * The existing length is measured by seeking to the end of the file.
 * Seeking to `fsize` would not work as a size test: fseek() succeeds for
 * offsets past end-of-file.  ftell() returns a long, so a file too large
 * for `long` is treated as "too small" and regenerated.
 *
 * Nothing is returned; failures are reported on stderr.
 */
void make_testfile(char *fn, long long fsize)
{
    const long long blksize = 1000LL*1000*100;   /* ints per full block */
    const long long blk_bytes = blksize * (long long)sizeof(int);
    long long buf_bytes;
    long long written = 0;
    size_t buf_ints;
    int *rndbuf;
    FILE *fd;

    /* test if file already exists and is large enough */
    fd = fopen(fn, "r");
    if (fd != NULL) {
        long end_pos = -1L;

        if (fseek(fd, 0L, SEEK_END) == 0) {
            end_pos = ftell(fd);
        }
        fclose(fd);

        if (end_pos >= 0 && (long long)end_pos >= fsize) {
            printf("%s already exists.\n", fn);
            return;
        }
        printf("%s is too small.\n", fn);
    } else {
        printf("%s does not exist.\n", fn);
    }

    fd = fopen(fn, "w");
    if (fd == NULL) {
        fprintf(stderr, "File open error.\n");
        return;
    }

    if (fsize > 0) {
        /* Never allocate more than the file needs; round up to whole ints
           so the rand() fill below always stays inside the buffer. */
        buf_bytes = (fsize < blk_bytes) ? fsize : blk_bytes;
        buf_ints = (size_t)((buf_bytes + (long long)sizeof(int) - 1)
                            / (long long)sizeof(int));

        rndbuf = malloc(buf_ints * sizeof *rndbuf);
        if (rndbuf == NULL) {
            fprintf(stderr, "Memory alloc error.\n");
            fclose(fd);
            return;
        }

        while (written < fsize) {
            long long chunk = fsize - written;
            size_t fill_ints;
            size_t k;

            if (chunk > buf_bytes) {
                chunk = buf_bytes;
            }
            fill_ints = (size_t)((chunk + (long long)sizeof(int) - 1)
                                 / (long long)sizeof(int));
            for (k = 0; k < fill_ints; k++) {
                rndbuf[k] = rand();
            }

            /* Write by bytes so that a final partial block brings the file
               to exactly fsize instead of being dropped. */
            if (fwrite(rndbuf, 1, (size_t)chunk, fd) != (size_t)chunk) {
                fprintf(stderr, "File write error.\n");
                break;
            }
            written += chunk;
            printf("%lld bytes written.\n", written);
        }

        free(rndbuf);
    }

    /* fclose() flushes buffered data, so a failure here means lost bytes. */
    if (fclose(fd) != 0) {
        fprintf(stderr, "File close error.\n");
    }
}
Additional information (parts of the IP addresses have been replaced with xx or yy):
Result of nfsstat -m:
/net/p390/common from p390:/common/
Flags: rw,nosuid,nodev,relatime,vers=4,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=xx.xx.xx.201,minorversion=0,local_lock=none,addr=xx.xx.xx.202
/net/tera1/raid0/data/_NAS_NFS_Exports_ from tera1:/raid0/data/_NAS_NFS_Exports_
Flags: rw,nosuid,nodev,relatime,vers=3,rsize=262144,wsize=262144,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.245,mountvers=3,mountport=52591,mountproto=udp,local_lock=none,addr=xx.xx.xx.245
/net/tera1/raid0/data/_NAS_NFS_Exports_/share from tera1:/raid0/data/_NAS_NFS_Exports_/share
Flags: rw,nosuid,nodev,relatime,vers=3,rsize=262144,wsize=262144,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.245,mountvers=3,mountport=52591,mountproto=udp,local_lock=none,addr=xx.xx.xx.245
/net/z80/home from z80:/home/
Flags: rw,nosuid,nodev,relatime,vers=4,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=xx.xx.xx.201,minorversion=0,local_lock=none,addr=xx.xx.xx.205
/net/tera4/raiddata/0/_NAS_NFS_Exports_ from tera4:/raiddata/0/_NAS_NFS_Exports_
Flags: rw,nosuid,nodev,relatime,vers=3,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.247,mountvers=3,mountport=49573,mountproto=udp,local_lock=none,addr=xx.xx.xx.247
/net/tera4/raiddata/0/_NAS_NFS_Exports_/share from tera4:/raiddata/0/_NAS_NFS_Exports_/share
Flags: rw,nosuid,nodev,relatime,vers=3,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=xx.xx.xx.247,mountvers=3,mountport=49573,mountproto=udp,local_lock=none,addr=xx.xx.xx.247
/net/p390/home from p390:/home/
Flags: rw,nosuid,nodev,relatime,vers=4,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=xx.xx.xx.201,minorversion=0,local_lock=none,addr=xx.xx.xx.202
/net/sinuhe/data from sinuhe:/data
Flags: rw,nosuid,nodev,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=yy.yy.yy.71,mountvers=3,mountport=892,mountproto=udp,local_lock=none,addr=yy.yy.yy.71
/net/sinuhe/opt from sinuhe:/opt
Flags: rw,nosuid,nodev,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=yy.yy.yy.71,mountvers=3,mountport=892,mountproto=udp,local_lock=none,addr=yy.yy.yy.71
Result of mount -v:
/dev/mapper/vg_megprec-LogVol01 on / type ext4 (rw)
proc on /proc type proc (rw)
sysfs on /sys type sysfs (rw)
devpts on /dev/pts type devpts (rw,gid=5,mode=620)
tmpfs on /dev/shm type tmpfs (rw,rootcontext="system_u:object_r:tmpfs_t:s0")
/dev/sdf1 on /boot type ext4 (rw)
none on /proc/sys/fs/binfmt_misc type binfmt_misc (rw)
sunrpc on /var/lib/nfs/rpc_pipefs type rpc_pipefs (rw)
p390:/common on /net/p390/common type nfs (rw,nosuid,nodev,intr,sloppy,vers=4,addr=xx.xx.xx.202,clientaddr=xx.xx.xx.201)
tera1:/raid0/data/_NAS_NFS_Exports_ on /net/tera1/raid0/data/_NAS_NFS_Exports_ type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.245)
tera1:/raid0/data/_NAS_NFS_Exports_/share on /net/tera1/raid0/data/_NAS_NFS_Exports_/share type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.245)
z80:/home on /net/z80/home type nfs (rw,nosuid,nodev,intr,sloppy,vers=4,addr=xx.xx.xx.205,clientaddr=xx.xx.xx.201)
tera4:/raiddata/0/_NAS_NFS_Exports_ on /net/tera4/raiddata/0/_NAS_NFS_Exports_ type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.247)
tera4:/raiddata/0/_NAS_NFS_Exports_/share on /net/tera4/raiddata/0/_NAS_NFS_Exports_/share type nfs (rw,nosuid,nodev,intr,sloppy,addr=xx.xx.xx.247)
p390:/home on /net/p390/home type nfs (rw,nosuid,nodev,intr,sloppy,vers=4,addr=xx.xx.xx.202,clientaddr=xx.xx.xx.201)
sinuhe:/data on /net/sinuhe/data type nfs (rw,nosuid,nodev,intr,sloppy,addr=yy.yy.yy.71)
sinuhe:/opt on /net/sinuhe/opt type nfs (rw,nosuid,nodev,intr,sloppy,addr=yy.yy.yy.71)