5

I am trying to open a .txt file that is wholly Chinese. Can I use normal fopen/fclose procedures to it even though the stream would be 100% Unicode or are there any exlusive tools for handling wide characters? I'd be grateful for precise answers, I am a beginner programmer. I am using Linux with standard gcc.

I will attach my code, it compiles with no error but upon execution I get segmentation fault. I don't know what is wrong with it. The point of this programme is to copy each string of Chinese signs in which a specific sign from a given set is to be found and to write it in a separate file.

#include<stdio.h>
#include<stdlib.h>
#include<wchar.h>
#include <locale.h>
#define PLIK_IN in /*filenames*/
#define PLIK_OUT out
#define LKON 49 /*specifying the length of a string on the left from a desired sign*/
#define PKON 50 /*...and on the right*/
int wczytaj_pliki(FILE*, FILE*); /*open file*/
void krocz_po_pliku(FILE*, FILE*); /*search through file*/
int slownik(wchar_t); /*compare signs*/
void zapisz_pliki(FILE*, FILE*); /*write to file*/

void main(void)
{
    FILE *bin,*bout;
    setlocale(LC_CTYPE, "");

    wczytaj_pliki(bin, bout);
    krocz_po_pliku(bin, bout);
    zapisz_pliki(bin, bout);
}/*main*/

int slownik(wchar_t znak) /*compare characters*/
{
    wchar_t gznak1 = L'股', gznak2 = L'利', gznak3 = L'红';
    if ( ( znak == gznak1) || (znak == gznak2) || (znak == gznak3) ) return 1;
    return 0;
}/*slownik*/

void krocz_po_pliku(FILE* bin, FILE* bout) /*search through file*/
{
    wchar_t wch;
    wchar_t* kontekst;
    int i = 0, j, step = LKON, counter = 0, token = 0;

    while ( (wch = getwchar() ) != EOF )
    {
        if (!token) /*comparing consecutive signs*/
    {
        if ( slownik(wch) == 1 )
        {
            counter++;
            fprintf(bout,"###Wystapienie %d.\n\n", counter);
            if ( i<step ) step = i;
            fseek(bin,-step,1);
            j=0, token = 1;
        }/*if*/
        else i++;
    }/*if*/
    else /*writing consecutive signs within context*/
    {
        if ( j < LKON + PKON)
        {
            putwc(wch, bout);
            j++;
        }/*if*/
        else
        {
            fprintf(bout,"###\n\n");
            fflush(bout);
            token = 0;
        }/*else*/
    }/*else*/
    }/*while*/
        printf("Znalazlem %d wystapien\n", counter);
}/*krocz_po_pliku*/

int wczytaj_pliki(FILE* bin, FILE* bout)
{
    bin=fopen("PLIK_IN","r");
    bout=fopen("PLIK_OUT","w");
    rewind(bin);
    if(bin==NULL || bout==NULL)
{
    printf("Blad plikow\n");
    exit(0);
}/*if*/
    return 1;
}/*wczytaj pliki*/

void zapisz_pliki(FILE* bin, FILE* bout)
{
fclose(bin);
fclose(bout);
}
Makoto
  • 104,088
  • 27
  • 192
  • 230
yauser
  • 707
  • 2
  • 8
  • 19
  • 1
    First time I've ever seen the combination of Chinese character literals and Polish(?) variable names. – dan04 Nov 29 '11 at 22:27
  • 3
    100% Unicode? Not from concentrate? What would a file that's only 85% Unicode look like? – Kerrek SB Dec 07 '11 at 14:59

2 Answers2

3

Yes, fopen can open a file that contains any data, including Unicode data, as long as you can represent the filename in a char*. (On some platforms, namely Windows, files may have names that cannot be represented in a char*).

You will want to open the file in binary mode to prevent any new line substitution that may be done (unless the Unicode encoding is UTF-8 and then it doesn't matter), because the substitution will be done in terms of chars. Also, if the code units are more than one byte you will need to make sure you're reading them with the correct endianness.

Note that wchar_t isn't necessarily Unicode an may not be the right type for whatever Unicode encoding is being used by your files. And if your program supports multiple Unicode encodings do not use BOMs to guess which encoding a file uses.

bames53
  • 86,085
  • 15
  • 179
  • 244
  • You *can* `fopen` a file with a non-ANSI name on Windows by using its short name. – dan04 Nov 29 '11 at 22:24
  • Short names are not always available, and in the future Windows will move to a new file system (ReFS) which will not support them at all. – bames53 Oct 19 '12 at 01:25
1

Your problem might be caused by the fact, that you

#define PLIK_IN in /*filenames*/

and then

bin=fopen("PLIK_IN","r");

Your programme is trying to open a file named PLIK_IN and not a file namedin. If PLIK_IN doesn't exist, fopen returns 0. Passing 0 to rewind causes your executable to die.

If you'd like to open in, you should

#define PLIK_IN "in" /*filenames*/
/* ... */
bin=fopen(PLIK_IN,"r");

Same goes for PLIK_OUT.

Last but not the least, remember to code in English. It's a lingua franca in our business and using it significantly increases the number of people who can help you out :)

Jan
  • 11,636
  • 38
  • 47