lzss decoding EOF character issue

Question

I am using Okamura's implementation of lzss : https://oku.edu.mie-u.ac.jp/~okumura/compression/lzss.c

int enc_i = 0;


int getbit(int n, unsigned char *enco_buf) /* get n bits */
{
    int i, x;
    static int buf, mask = 0;

    x = 0;
    for (i = 0; i < n; i++) {
        if (mask == 0) {
            //if ((buf = fgetc(infile)) == EOF) return EOF;
            //if ((buf = fgetc(infile)) == EOF) return EOF;
            if((buf = enco_buf[enc_i]) == '\0') break;
            //buf = enco_buf[enc_i];
            enc_i++;
            //printf("\nValue of enc : %d\n", enc_i);
            mask = 128;
        }
        x <<= 1;
        if (buf & mask) x++;
        mask >>= 1;

    }
    return x;
}

void decode(unsigned char *enco_buf)
{
    int i, j, k, r, c;
    outfile = fopen ("test_lib.txt", "wb");

    for (i = 0; i < N - F; i++) buffer[i] = ' ';
    r = N - F;
    while ((c = getbit(1, enco_buf)) != '\0') {
        if (c) {
            if ((c = getbit(8, enco_buf)) == '\0') break;
            fputc(c, outfile);
            buffer[r++] = c;  r &= (N - 1);
        } else {
            if ((i = getbit(EI, enco_buf)) == '\0') break;
            if ((j = getbit(EJ, enco_buf)) == '\0') break;
            for (k = 0; k <= j + 1; k++) {
                c = buffer[(i + k) & (N - 1)];
                fputc(c, outfile);
                buffer[r++] = c;  r &= (N - 1);
            }
        }
    }
    fclose(outfile);
}

Rather than taking the input from file, I am taking an encoded buffer which I am passing to the function decode() which in-turns passes to getbit(). But it is decoding upto a certain number of bits ie. 43. What I am guessing is I put '\0' instead of EOF (as it was in file end marker) to detect string end. and that might be the problem where break statement is executed finding '\0'. It might not be true but It might be the case. The encoding part has no problem as I verified it with the original code. Can anyone suggest where might be the problem?

EDIT : source code:

/* LZSS encoder-decoder (Haruhiko Okumura; public domain) */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "lzss_lib.h"

#define EI 11  /* typically 10..13 */
#define EJ  4  /* typically 4..5 */
#define P   1  /* If match length <= P then output one character */
#define N (1 << EI)  /* buffer size */
#define F ((1 << EJ) + 1)  /* lookahead buffer size */

int bit_buffer = 0, bit_mask = 128;
unsigned long codecount = 0, textcount = 0;
unsigned char buffer[N * 2];
int payload_i = 0;
int enc_i = 0;

FILE *infile, *outfile;

void error(void)
{
    printf("Output error\n");  exit(1);
}

void putbit1(unsigned char *put_buf0)
{
    bit_buffer |= bit_mask;
    if ((bit_mask >>= 1) == 0) {
        put_buf0[payload_i] = bit_buffer;
        payload_i++;
        bit_buffer = 0;  bit_mask = 128;  codecount++;
    }
}

void putbit0(unsigned char *put_buf1)
{
    if ((bit_mask >>= 1) == 0) {
        put_buf1[payload_i] = bit_buffer;
        payload_i++;
        bit_buffer = 0;  bit_mask = 128;  codecount++;
    }
}

void flush_bit_buffer(unsigned char *flush_bit)
{
    if (bit_mask != 128) {
        flush_bit[payload_i] = bit_buffer;
        codecount++;
    }
}

void output1(int c, unsigned char *buf_output1)
{
    int mask;

    putbit1(buf_output1);
    mask = 256;
    while (mask >>= 1) {
        if (c & mask) putbit1(buf_output1);
        else putbit0(buf_output1);
    }
}

void output2(int x, int y, unsigned char *buf_output2)
{
    int mask;

    putbit0(buf_output2);
    mask = N;
    while (mask >>= 1) {
        if (x & mask) putbit1(buf_output2);
        else putbit0(buf_output2);
    }
    mask = (1 << EJ);
    while (mask >>= 1) {
        if (y & mask) putbit1(buf_output2);
        else putbit0(buf_output2);
    }
}

void encode(unsigned char *string_buf, unsigned char *out_buf, unsigned long *out_buf_len)
{

    int i, j, f1, x, y, r, s, bufferend, c, tt;
    tt = 0;
    for (i = 0; i < N - F; i++) buffer[i] = ' ';
    for (i = N - F; i < N * 2; i++) {
        if((c = string_buf[tt]) == '\0') break;
        buffer[i] = c;  textcount++; tt++;
    }
    bufferend = i;  r = N - F;  s = 0;
    while (r < bufferend) {
        f1 = (F <= bufferend - r) ? F : bufferend - r;
        x = 0;  y = 1;  c = buffer[r];
        for (i = r - 1; i >= s; i--)
            if (buffer[i] == c) {
                for (j = 1; j < f1; j++)
                    if (buffer[i + j] != buffer[r + j]) break;
                if (j > y) {
                    x = i;  y = j;
                }
            }

        if (y <= P) {  y = 1;  output1(c, out_buf);  }
        else output2(x & (N - 1), y - 2, out_buf);

        r += y;  s += y;
        if (r >= N * 2 - F) {
            for (i = 0; i < N; i++) buffer[i] = buffer[i + N];
            bufferend -= N;  r -= N;  s -= N;
            while (bufferend < N * 2) {
                if((c = string_buf[tt]) == '\0') break;
                buffer[bufferend++] = c;  textcount++; tt++;
            }

        }

    }

    flush_bit_buffer(out_buf);
    *out_buf_len = codecount;
    printf("text:  %ld bytes\n", textcount);
    printf("code:  %ld bytes (%ld%%)\n", codecount, (codecount * 100) / textcount);

}

int getbit(int n, unsigned char *enco_buf) /* get n bits */
{
    int i, x;
    static int buf, mask = 0;

    x = 0;
    for (i = 0; i < n; i++) {
        if (mask == 0) {
            //if ((buf = fgetc(infile)) == EOF) return EOF;
            //if ((buf = fgetc(infile)) == EOF) return EOF;
            if((buf = enco_buf[enc_i]) == '\0') break;
            //buf = enco_buf[enc_i];
            enc_i++;
            printf("\nValue of enc : %d\n", enc_i);
            mask = 128;
        }
        x <<= 1;
        if (buf & mask) x++;
        mask >>= 1;

    }

    return x;
}

void decode(unsigned char *enco_buf)
{
    int i, j, k, r, c;
    outfile = fopen ("test_lib.txt", "wb");

    for (i = 0; i < N - F; i++) buffer[i] = ' ';
    r = N - F;
    while ((c = getbit(1, enco_buf)) != '\0') {
        if (c) {
            if ((c = getbit(8, enco_buf)) == '\0') break;
            fputc(c, outfile);
            buffer[r++] = c;  r &= (N - 1);
        } else {
            if ((i = getbit(EI, enco_buf)) == '\0') break;
            if ((j = getbit(EJ, enco_buf)) == '\0') break;
            for (k = 0; k <= j + 1; k++) {
                c = buffer[(i + k) & (N - 1)];
                fputc(c, outfile);
                buffer[r++] = c;  r &= (N - 1);
            }
        }
    }
    fclose(outfile);
}

and the test example :

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "lzss_lib.h"

unsigned char compr_data[4096];
unsigned long pw = 0;
unsigned long payload_len = 0;
volatile int errno;
unsigned char *unc_data = "d(2306):AuthorisationScheme:RADIUSserveratfd04:bd3:80e8:1::1usingPAPD/6LoWPANd(2306):WritingModule:SecurityConfigD/6LoWPANd(2306):WritingModule:RunCoordinatorD/6LoWPANd(2306):RequestingmoduleaddressD/6LoWPANd(2306):WritingModule:GetAddressD/smartcard-jni(2781):SmartCard_state_update_callback:status=6D/SmartCardNative(2781):status=6D/smartcard(2781):PN532Smartcard_loop_threadexitD/smartcard-jni(2781):SmartCard_loop_thread:SMARTCARD_STATUS_ABORTD/smartcard(2781):Smartcard_loop_uninitD/smartcard(2781):2D/serialcomm_pn532(2781):PN532:Readingfrom/dev/ttyUSB0-->D/smartcard(2781):Received(0x3)fromPN532:(dataBuf[0]:0x1)(0x1f5988)D/smartcard(2781):ReceivedStatusfromPN532:OK(cmd:0x2)D/smartcard-jni(2781):SmartCard_listener_update_callbackD/smartcard(2781):Received(0x1c2)fromPN532:(dataBuf[0]:0x32)(0x1f5988)D/smartcard(2781):vd(2306):AuthorisationScheme:RADIUSserveratfd04:bd3:80e8:1::1usingPAPD/6LoWPANd(2306):";
FILE *outfile;

int main(void) {
    memset (compr_data, '\0', sizeof(compr_data));
    encode(unc_data, compr_data, &payload_len);

    decode(compr_data);

    return 0;
}

A buffer is not a string and it can have `\0`s anywhere in the middle. There is no way to indicate the end of the buffer with an out-of-bamd character. You **must** use a separate buffer length parameter. — n. m. could be an AI, May 14 '17 at 05:38
@n.m. in case there are '\0' and I need to pass the length i need to do strlen which will again end in '\0' , not the whole buffer, what should I do? Suppose the the buffer length can not be known by me beforehand — ninja.stop, May 14 '17 at 06:23
@IljaEverilä So what should I do? strlen won't help, also if i read the whole length of the buffer irrespective the actual length of it, it treats '\0' as data too and decodes it. the fgetc thing works well. but this buffer read is having problem. — ninja.stop, May 14 '17 at 07:10
Is a zero byte really special, or are you still mixing strings and data? Honest question, I'm not familiar with lzss. In general it's nothing special, when handling *data*. You have to have the length of the data buffer in a separate variable, as @n.m. pointed out. — Ilja Everilä, May 14 '17 at 07:14
you cannot use 0 as end of data indicator, you need to create separate counter and track length of the econded/decoded data. separate note: EOF != 0 — Iłya Bursov, May 14 '17 at 07:17
You need not and should not call strlen on something that isn't a string. You **must** know the buffer length in order to be able to implement this algorithm. There is no way around it. — n. m. could be an AI, May 14 '17 at 07:27
@n.m. I know my initialized array length which is 4096. I can read and decode upto that but in that case, it will treat '\0' to be also decoded and that is where the problem is. Can we continue to chat so that I can explain you more? — ninja.stop, May 14 '17 at 07:30
@n.m I am adding the full source code in edit. Please check. — ninja.stop, May 14 '17 at 07:33
@n.m. added the source and test example. If the encoded content is written in a file and then decoded with original implementation as mentioned in the question, it works well. But the buffer way having the problem. fgetc is handling this well. — ninja.stop, May 14 '17 at 07:38
`if((c = string_buf[tt]) == '\0') break;` This **only** works well on string data. It **doesn't work** on binary data. You happen to only use string data in your program, which may or may not be OK depending on what your program will ultimately do. If you want to ignore the possibility of encoding binary data, it may work for you in the short run, but you **cannot** ignore binary data when decoding. Once you know how to handle binary buffers, go back and change the encoding routine so that it works with binary data. — n. m. could be an AI, May 14 '17 at 07:48
Instead of breaking on `\0`. you have to **count** how many characters you have processed. Once that number equals to the number of characters you have in the input (the input buffer length), your processing is over. That's all there is to it. Your encoded buffer length **cannot** be 4096. 4096 is the **maximum** length your input buffer can hold. Your `encode` routine **must** return the length of its output buffer. It doesn't have to when it writes a file, because a file contains its own size. A buffer **does not** contain its own size, you **must** maintain the size separately. — n. m. could be an AI, May 14 '17 at 07:53
@n.m. yes. true. I found binary file handling as of now. DIdin't find binary buffer handling for such cases. can you please provide any content on this? — ninja.stop, May 14 '17 at 08:00
Count characters both on input annd on output, nothing more complicated than that. — n. m. could be an AI, May 14 '17 at 08:06

lzss decoding EOF character issue

0 Answers0

Linked