So I've been working with the CC-CEDICT, a free downloadable Chinese-English dictionary. I've been using python to make some small changes and reformat the dictionary. When I ran code that just reorganized the dictionary as a csv file, I had no issues and the characters were written into the file as expected. Here is the code for that:
filename = 'cedict_ts.u8.txt'
newname = 'cedict_ts.u8.csv'
f = open(filename,'r')
allLines = f.readlines()
f.close()
newf = open(newname, 'w')
endofhash = False
for i in range(0, len(allLines)):
curLine = allLines[i]
if curLine[0] == '#':
newf.write(curLine)
else:
if(not endofhash):
newarr = ['Traditional','Simplified','Pinyin','Definition(s)\r\n']
newline = ','.join(newarr)
newf.write(newline)
endofhash = True
firstws = curLine.find(' ')
lsbrack = curLine.find('[')
rsbrack = curLine.find(']')
fslash = curLine.find('/')
lslash = curLine.rfind('/')
trad = curLine[0:firstws]
simp = curLine[firstws+1:lsbrack-1]
piny = curLine[lsbrack+1:rsbrack]
defin = curLine[fslash+1:lslash]
defin = defin.replace('/','; ')
defin = defin + '\r\n'
newarr = [trad, simp, piny, defin]
newline = ','.join(newarr)
newf.write(newline)
newf.close()
However, when I run a program that also changes the pinyin system and adds it to the dictionary, the content of the text file is gobbly-gook. But, as a test I had the program print out each line before it was written to the text file, and it prints to the terminal as expected. Here is the code that does that:
from pinyinConverter import *
filename = 'cedict_ts.u8.txt'
newname = 'cedict_ts_wpym.u8.csv'
f = open(filename,'r')
allLines = f.readlines()
f.close()
apy = readPinyinTextfile('pinyinchars.txt')
newf = open(newname, 'w')
endofhash = False
for i in range(0, len(allLines)):
curLine = allLines[i]
if curLine[0] == '#':
newf.write(curLine)
else:
if(not endofhash):
newarr = ['Traditional','Simplified','Pinyin','PinyinWithMarks','Definition(s)\r\n']
newline = ','.join(newarr)
newf.write(newline)
endofhash = True
firstws = curLine.find(' ')
lsbrack = curLine.find('[')
rsbrack = curLine.find(']')
fslash = curLine.find('/')
lslash = curLine.rfind('/')
trad = curLine[0:firstws]
simp = curLine[firstws+1:lsbrack-1]
piny = curLine[lsbrack+1:rsbrack]
split_piny = piny.split(' ')
for i in range(0, len(split_piny)):
curPin = split_piny[i]
newPin = convertPinyinSystem(curPin, apy)
split_piny[i] = newPin
pnwm = ' '.join(split_piny)
defin = curLine[fslash+1:lslash]
defin = defin.replace('/','; ')
defin = defin + '\r\n'
newarr = [trad, simp, piny, pnwm, defin]
newline = ','.join(newarr)
newf.write(newline)
newf.close()
And here is the pinyinConverter file code:
def convertPinyinSystem(inputString, allPinyin):
chars = ['a','e', 'i', 'o','u','u:']
tone = grabTone(inputString)
toneIdx = (tone - 1) * 2
hasIdx = -1
for i in range(0, len(chars)):
if(chars[i] in inputString):
hasIdx = i
newString = inputString
newString = newString.replace(str(tone),'')
if(not ('iu' in inputString)):
newChar = allPinyin[hasIdx][toneIdx:toneIdx+2]
else:
newChar = allPinyin[4][toneIdx:toneIdx+2]
newString = newString.replace(chars[hasIdx],newChar)
if(tone == 5):
newString = inputString
newString = newString.replace(str(tone),'')
return newString
elif(tone == -1):
return inputString
else:
return newString
def readPinyinTextfile(pinyintextfile):
f = open(pinyintextfile, 'r')
allLines = f.readlines()
f.close()
for i in range(0, len(allLines)):
curLine = allLines[i]
curLine = curLine[0:len(curLine)-1]
allLines[i] = curLine
return allLines
def grabTone(inputText):
isToneIdx = False
idx = 0
while(not isToneIdx):
isToneIdx = is_int(inputText[idx])
if(isToneIdx):
break
else:
idx += 1
if(idx == len(inputText)):
return -1
return int(inputText[idx])
def is_int(s):
try:
int(s)
return True
except ValueError:
return False
And the content of the pinyin chars.txt file is this:
āáăà
ēéĕè
īíĭì
ōóŏò
ūúŭù
ǖǘǚǜ
I'm on a 2009 MacBook Pro, running OSX version 10.8.5, python is version 2.7.6 and the coding of the dictionary is UTF-8. Also I know some of the code for doing the pinyin conversion is not optimized, but for this it doesn't really matter.