I'm trying to write a method to count the number of words when the content is in chinese and japanese. This should exclude the special characters / punctuations / whiteSpaces.
I tried creating a regex for each locale and find the words based on it. Tried looking for existing regex on internet but none of them seems to be working. My approach -
function countWords(text, locale) {
let wordCount = 0;
// Set the word boundary based on the locale
let wordBoundary = '\\b';
if (locale === 'ja') {
// Japanese word boundary
wordBoundary = '[\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Han}ー]+';
} else if (locale === 'zh') {
// Chinese word boundary
wordBoundary = '[\\p{Script=Han}]+';
}
const regex = new RegExp(wordBoundary, 'gu');
const matches = text.matchAll(regex);
for (const match of matches) {
wordCount++;
}
return wordCount;
}
I thought this should work, but I'm comparing the word count in MS word and using this logic, they are coming different