I've written a JPEG codec. It's maintained at https://github.com/MalcolmMcLean/babyxrc. You're welcome to take a look, or even use it, but that doesn't really answer your question.
JPEG is based on 16x16 blocks for chrominance (when the chrominance is sub-sampled 2x2) and 8x8 blocks for luminance. So it's not surprising that an initial version of your software crashes after the first 16x16 block. It's just a routine programming error. If you can't find it by reading the JPEG spec, fire up an editor and create a flat 32x32 image. Then look at the binary and see where it differs from yours.
Here's my loadscan function for the case with no sub-sampling:
/*
  Decode a scan in which every MCU holds one 8x8 luminance block followed
  by one 8x8 Cb block and one 8x8 Cr block (no chroma sub-sampling), and
  write the result as RGB pixels.

  Params: hdr  - header supplying width, height, dri, the per-component
                 table selectors (usedc, useac, useq) and the tables
                 themselves (dctable, actable, qttable)
          buff - output buffer; pixel (x, y) is stored as three bytes
                 (red, green, blue) at offset y * width * 3 + x * 3
          fp   - input stream, read through a BITSTREAM
  Returns: 0 if loadeoi() returns 0 once the scan is consumed, else -1
*/
static int loadscanYuv111(JPEGHEADER *hdr, unsigned char *buff, FILE *fp)
{
  short coef[3][64];            /* [0] = Y, [1] = Cb, [2] = Cr */
  int pred[3] = {0, 0, 0};      /* running DC value for each component */
  int acsel[3];
  int dcsel[3];
  BITSTREAM *bs;
  int top;
  int left;
  int row;
  int col;
  int c;
  int k;
  int px;
  int offset;
  int nblocks = 0;
  int luma;
  int r;
  int g;
  int b;

  /* cache the table selectors before decoding starts */
  for (c = 0; c < 3; c++)
  {
    acsel[c] = hdr->useac[c];
    dcsel[c] = hdr->usedc[c];
  }

  bs = bitstream(fp);

  for (top = 0; top < hdr->height; top += 8)
  {
    for (left = 0; left < hdr->width; left += 8)
    {
      /* every hdr->dri MCUs (when dri is non-zero) read a marker and
         restart the DC prediction from zero */
      if (nblocks > 0 && hdr->dri && (nblocks % hdr->dri) == 0)
      {
        readmarker(bs);
        pred[0] = 0;
        pred[1] = 0;
        pred[2] = 0;
      }

      /* the three components are interleaved in the stream: Y, Cb, Cr */
      for (c = 0; c < 3; c++)
      {
        short *blk = coef[c];

        getblock(blk, hdr->dctable[dcsel[c]], hdr->actable[acsel[c]], bs);

        /* DC is coded as a difference from the previous block's DC */
        blk[0] += pred[c];
        pred[c] = blk[0];

        /* dequantise, then reorder and inverse-transform in place */
        for (k = 0; k < 64; k++)
          blk[k] *= hdr->qttable[hdr->useq[c]][k];
        unzigzag(blk);
        idct8x8(blk);
      }

      /* convert to RGB, skipping samples that fall outside the image */
      for (row = 0; row < 8 && top + row < hdr->height; row++)
      {
        for (col = 0; col < 8 && left + col < hdr->width; col++)
        {
          px = row * 8 + col;
          offset = (top + row) * hdr->width * 3 + (left + col) * 3;

          luma = coef[0][px]/64 + 128;
          r = (int) (luma + 1.402 * coef[2][px]/64);
          g = (int) (luma - 0.34414 * coef[1][px]/64 - 0.71414 * coef[2][px]/64);
          b = (int) (luma + 1.772 * coef[1][px]/64);

          buff[offset] = clamp(r, 0, 255);
          buff[offset+1] = clamp(g, 0, 255);
          buff[offset+2] = clamp(b, 0, 255);
        }
      }

      nblocks++;
    }
  }

  killbitstream(bs);

  return (loadeoi(fp) == 0) ? 0 : -1;
}
As you can see, the data is interleaved.
However, if you got that wrong, it would produce a peculiar-looking image of the correct dimensions, not an image smaller than expected.