0

This program writes to 2 files.
In the right file the string is "IL RITROVO AL 1° PIANO"
In the wrong file the string is "IL RITROVO AL 1NUL PIANO".
In the second case, the "°" character has the wrong encoding; how can I detect this case before I write it?

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter; 

/**
 * Demo that writes two ISO-8859-15 encoded text files: one built from a raw
 * byte array that holds a NUL byte (0) where the degree sign should be, and
 * one built from a string literal containing the degree sign.
 */
public class WrongWriter {
    /** Charset name used for every byte/text conversion in this class. */
    private static final String CHARSET_NAME = "ISO-8859-15";

    static File wrongFile = new File("C:/Users/utente/Desktop/wrongFile.txt");
    static File rightFile = new File("C:/Users/utente/Desktop/rightFile.txt");

    /**
     * Writes the "wrong" and the "right" sample to their respective files.
     *
     * @param args unused
     * @throws IOException if either file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Byte 16 of this array is 0 (NUL) instead of the degree sign.
        byte[] wrongBytes = new byte[]{
                73, 76, 32, 82, 73, 84, 82, 79, 86, 79, 32, 65, 76, 32, 49, 0, 32, 80, 73, 65, 78, 79, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
                };
        write(wrongFile, wrongBytes);

        // \u00B0 is the degree sign; the escape keeps the literal independent of
        // the source file encoding. Encode explicitly so the bytes match the
        // charset that write() decodes with, instead of the platform default.
        byte[] rightBytes = "IL RITROVO AL 1\u00B0 PIANO".getBytes(CHARSET_NAME);
        write(rightFile, rightBytes);
    }

    /**
     * Decodes {@code bytes} as ISO-8859-15 and writes the resulting text to
     * {@code file} in the same encoding.
     *
     * @param file  destination file
     * @param bytes ISO-8859-15 encoded text
     * @throws IOException if the file cannot be opened or written
     */
    static void write(File file, byte[] bytes) throws IOException {
        // try-with-resources closes (and thereby flushes) the writer even when write() throws.
        try (OutputStreamWriter stream =
                     new OutputStreamWriter(new FileOutputStream(file), CHARSET_NAME)) {
            stream.write(new String(bytes, CHARSET_NAME));
        }
    }

}
Frizz1977
  • 1,121
  • 13
  • 21
  • First thing. Never use `String.getBytes()`. Always specify an encoding (such as utf-8). When decoding you can use a CharsetDecoder and specify the behavior when an unmappable character is encountered. http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html#newDecoder() http://docs.oracle.com/javase/7/docs/api/java/nio/charset/CharsetDecoder.html#onUnmappableCharacter(java.nio.charset.CodingErrorAction) – Brett Okken Jul 01 '14 at 11:58

2 Answers2

0

String/char/Writer/Reader are Unicode text in java. (This makes java unique among other languages.) Java text can always contain any mix of scripts.

byte[]/InputStream/OutputStream are binary data in Java. To be interpreted as text they must be given their encoding.

So you can do:

// try-with-resources closes the writer even if write() throws.
try (OutputStreamWriter stream =
             new OutputStreamWriter(new FileOutputStream(file), "ISO-8859-15")) {
    // OutputStreamWriter has no print(); Writer.write(String) is the method to use.
    stream.write("IL RITROVO AL 1° PIANO");
}

The class OutputStreamWriter bridges the two: it writes the Unicode text as bytes in that encoding.

In general the conversions are:

// Text -> bytes and bytes -> text, naming the charset explicitly in both directions.
byte[] inISO15 = "IL RITROVO AL 1° PIANO".getBytes("ISO-8859-15");
String s = new String(inISO15, "ISO-8859-15");

You used the OutputStream functionality of writing bytes, bypassing the conversion. That should then be done as:

// Here stream must be an OutputStream (e.g. a FileOutputStream), not a Writer.
stream.write(inISO15);

But then better not use a Writer, but maybe immediately the FileOutputStream or a BufferedOutputStream.

Joop Eggen
  • 107,315
  • 7
  • 83
  • 138
  • My problem is: "given a byte array, how can I tell whether its bytes are valid in a given encoding?" (in this case ISO-8859-1).
    I solved it with the
    static public boolean isISO_8859_1
    method, but I'd like to see something that does the same thing in the Java core libraries
    – Frizz1977 Jul 04 '14 at 09:46
  • ISO-8859-1 is a single-byte encoding, indistinguishable from any other ISO-8859-... even if Greek. Windows-1252 is a superset of ISO-8859-1 (with comma-like quotes). UTF-8 would be recognizable with a high probability. I once wrote a language+encoding detector, using frequency lists of the most frequent words in any language. – Joop Eggen Jul 04 '14 at 10:32
0

Thanks, but this is what I wanted; I was not able to do it with CharsetDecoder....

package dummy;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter; 
import java.io.UnsupportedEncodingException;

/**
 * Demo that writes two ISO-8859-15 encoded text files. The first sample, a raw
 * byte array holding a NUL byte (0) where the degree sign should be, is only
 * written when {@code CharacterChecker.isISO_8859_1} accepts it; the second is
 * built from a string literal and always written.
 */
public class WrongWriter {
    /** Charset name used for every byte/text conversion in this class. */
    private static final String CHARSET_NAME = "ISO-8859-15";

    static File wrongFile = new File("C:/Users/utente/Desktop/wrongFile.txt");
    static File rightFile = new File("C:/Users/utente/Desktop/rightFile.txt");

    /**
     * Screens and writes the "wrong" sample, then writes the "right" sample.
     *
     * @param args unused
     * @throws IOException if a file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Byte 16 of this array is 0 (NUL) instead of the degree sign.
        byte[] wrongBytes = new byte[]{
                73, 76, 32, 82, 73, 84, 82, 79, 86, 79, 32, 65, 76, 32, 49, 0, 32, 80, 73, 65, 78, 79, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
                };

        if (CharacterChecker.isISO_8859_1(wrongBytes)) {
            write(wrongFile, wrongBytes);
        } else {
            System.out.println("Bad input");
        }

        // \u00B0 is the degree sign; the escape keeps the literal independent of
        // the source file encoding.
        byte[] rightBytes = "IL RITROVO AL 1\u00B0 PIANO".getBytes(CHARSET_NAME);
        write(rightFile, rightBytes);
    }

    /**
     * Decodes {@code bytes} as ISO-8859-15 and writes the resulting text to
     * {@code file} in the same encoding.
     *
     * @param file  destination file
     * @param bytes ISO-8859-15 encoded text
     * @throws IOException if the file cannot be opened or written
     */
    static void write(File file, byte[] bytes) throws IOException {
        // try-with-resources closes (and thereby flushes) the writer even when write() throws.
        try (OutputStreamWriter stream =
                     new OutputStreamWriter(new FileOutputStream(file), CHARSET_NAME)) {
            stream.write(new String(bytes, CHARSET_NAME));
        }
    }

}
/**
 * Screens bytes and strings against a fixed list of rejected byte values:
 * 0x00-0x1F, 0xA4, 0xA6, 0xA8, 0xB4, 0xB8, 0xBC, 0xBD and 0xBE.
 * Characters above U+00FF, which cannot be represented in ISO-8859-1 at all,
 * are rejected by the String-based methods as well.
 */
class CharacterChecker {

    /** Highest code point that ISO-8859-1 can represent. */
    private static final int MAX_LATIN1 = 0xFF;

    /**
     * Tells whether every byte of the array passes the screening.
     *
     * @param bytes bytes to inspect
     * @return {@code false} as soon as one rejected byte is found, otherwise {@code true}
     * @throws UnsupportedEncodingException never thrown; kept so existing callers still compile
     */
    public static boolean isISO_8859_1(byte[] bytes) throws UnsupportedEncodingException {
        for (byte b : bytes) {
            if (isRejected(b)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether every character of the string passes the screening.
     * The characters are inspected directly rather than through
     * {@code getBytes("ISO-8859-1")}, because that conversion silently turns
     * unmappable characters into '?', which would then be accepted.
     *
     * @param s text to inspect
     * @return {@code false} if any character is above U+00FF or maps to a rejected byte
     * @throws UnsupportedEncodingException never thrown; kept so existing callers still compile
     */
    public static boolean isISO_8859_1(String s) throws UnsupportedEncodingException {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c > MAX_LATIN1 || isRejected((byte) c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns a copy of {@code s} in which every rejected character is replaced.
     * A character above U+00FF (one replacement per code point) is replaced too,
     * instead of degrading to '?'.
     *
     * @param s        text to clean
     * @param chracter replacement character; it is first encoded to ISO-8859-1,
     *                 so a replacement that is itself unmappable becomes '?'
     * @return the cleaned text
     * @throws UnsupportedEncodingException if the ISO-8859-1 charset is unavailable
     */
    public static String replaceNotISO_8859_1_characters(String s, char chracter) throws UnsupportedEncodingException {
        byte encodedReplacement = Character.toString(chracter).getBytes("ISO-8859-1")[0];
        char replacement = (char) (encodedReplacement & 0xFF);

        StringBuilder cleaned = new StringBuilder(s.length());
        int index = 0;
        while (index < s.length()) {
            int codePoint = s.codePointAt(index);
            if (codePoint > MAX_LATIN1 || isRejected((byte) codePoint)) {
                cleaned.append(replacement);
            } else {
                cleaned.append((char) codePoint);
            }
            // Step over both halves of a surrogate pair at once.
            index += Character.charCount(codePoint);
        }
        return cleaned.toString();
    }

    /** Single definition of the rejected byte values shared by all public methods. */
    private static boolean isRejected(byte b) {
        int unsigned = b & 0xFF;
        if (unsigned < 0x20) {
            return true;
        }
        switch (unsigned) {
            case 0xA4:
            case 0xA6:
            case 0xA8:
            case 0xB4:
            case 0xB8:
            case 0xBC:
            case 0xBD:
            case 0xBE:
                return true;
            default:
                return false;
        }
    }

}
marcolopes
  • 9,232
  • 14
  • 54
  • 65
Frizz1977
  • 1,121
  • 13
  • 21