0

I've been working on grapheme-to-phoneme conversion in MATLAB and am trying to write more general code that first breaks a word into its consonants, digraphs and related vowels, segmenting each input string (word) into its grapheme form so that the appropriate phonetic synthesis can be produced later. However, because of the large number of rules (and hence a huge pile of if-elseif-else branches), the loop over each letter, and the growing arrays, the running time has increased manifold and the program gives no result: MATLAB shows "Busy" every time I enter a string at the console. Could you please help me simplify the code? Here it is:

% Reads one word from the console and tries to split it into graphemes,
% collecting them in the cell array `grapheme`, which is displayed at the end.
%
% NOTE(review): defects visible in this script as written:
%   * `i` is only advanced inside the two-letter branches. No single-letter
%     branch and no statement at the end of the loop body increments it, so
%     once a single-letter branch matches, the while loop never terminates.
%   * `str(i:i+1)` and `str(i+2)` are evaluated without checking that the
%     index stays within `l`, so they can index past the end of `str`.
%   * From the 'ck' test onward, the two-letter tests compare the single
%     character `str(i)` with a two-character literal, which strcmpi never
%     reports as equal.
%   * A two-letter test placed after the single-letter test for its own first
%     letter (e.g. 'bb' after 'b') cannot be reached.
prompt='Enter a string: ';
str=input(prompt,'s');
l=length(str);
% Starts empty and is grown by one cell per recognised grapheme.
grapheme=[];
i=1;

while (i<=l)

        % --- consonants: single letter first, then its two-letter spellings ---
        if strcmpi(str(i),'b')  
            grapheme=[grapheme;{'b'}];
        % str(i:i+1) exceeds the string when i == l.
        elseif strcmpi(str(i:i+1),'bb')
            grapheme=[grapheme;{'b'}];
            i=i+1;
        elseif strcmpi(str(i),'d') 
            grapheme=[grapheme;{'d'}];
        elseif strcmpi(str(i:i+1),'dd')||strcmpi(str(i:i+1),'ed')
            grapheme=[grapheme;{'d'}];
            i=i+1;
        elseif strcmpi(str(i),'f')  
            grapheme=[grapheme;{'f'}];
        elseif strcmpi(str(i:i+1),'ff')||strcmpi(str(i:i+1),'ph')||strcmpi(str(i:i+1),'gh')
            grapheme=[grapheme;{'f'}];
            i=i+1;
        elseif strcmpi(str(i),'g')
            grapheme=[grapheme;{'g'}];          
        elseif  strcmpi(str(i:i+1),'gg')%||strcmpi(str(i:i+2),'gue')
           grapheme=[grapheme;{'g'}]; 
           i=i+1;
        elseif strcmpi(str(i),'h')
          grapheme=[grapheme;{'h'}];  
        % 'wh' followed by 'o' is mapped to 'h'; str(i+2) is read without a bounds check.
        elseif  strcmpi(str(i:i+1),'wh') && strcmpi(str(i+2),'o')
           grapheme=[grapheme;{'h'}]; 
           i=i+1;
        elseif strcmpi(str(i),'j')
            grapheme=[grapheme;{'j'}];  
        elseif strcmpi(str(i:i+1),'ge')||strcmpi(str(i:i+1),'jj')
            grapheme=[grapheme;{'j'}];
            i=i+1;
%         elseif strcmpi(str(i:i+2),'dge')
%            grapheme=[grapheme;{'j'}];
%            i=i+2;
        elseif strcmpi(str(i),'k')||strcmpi(str(i),'c')
            grapheme=[grapheme;{'k'}]; 
        % From here on the two-letter tests use str(i) (one character), so they never match.
        elseif strcmpi(str(i),'ck')||strcmpi(str(i),'cc')||strcmpi(str(i),'qu')
            grapheme=[grapheme;{'k'}]; 
            i=i+1;
        % The 'l', 'm' and 'n' branches append with ',' (horizontal) while the
        % rest of the script appends with ';' (vertical).
        elseif strcmpi(str(i),'l')
            grapheme=[grapheme,{'l'}];
        elseif strcmpi(str(i),'ll')||strcmpi(str(i),'le')
            grapheme=[grapheme,{'l'}];
           i=i+1;
        elseif strcmpi(str(i),'m')
            grapheme=[grapheme,{'m'}];
        elseif strcmpi(str(i),'mm')||strcmpi(str(i),'lm')||strcmpi(str(i),'mn')
            grapheme=[grapheme;{'m'}]; 
            i=i+1;
       elseif strcmpi(str(i),'n')
            grapheme=[grapheme,{'n'}];
       elseif strcmpi(str(i),'nn')||strcmpi(str(i),'kn')||strcmpi(str(i),'pn')||strcmpi(str(i),'gn')
            grapheme=[grapheme;{'n'}]; 
            i=i+1;
        elseif strcmpi(str(i),'p')
            grapheme=[grapheme;{'p'}];
        elseif strcmpi(str(i),'pp')
            grapheme=[grapheme;{'p'}];
            i=i+1;
        elseif strcmpi(str(i),'r')
            grapheme=[grapheme;{'r'}];
        elseif strcmpi(str(i),'rr')||strcmpi(str(i),'wr')||strcmpi(str(i),'rh')
            grapheme=[grapheme;{'r'}]; 
            i=i+1;
        elseif strcmpi (str(i),'s')
            grapheme=[grapheme;{'s'}];
         elseif strcmpi(str(i),'ss')||strcmpi(str(i),'sc')||strcmpi(str(i),'ce')||strcmpi(str(i),'se')||strcmpi(str(i),'ps')
            grapheme=[grapheme;{'s'}]; 
            i=i+1; 
         elseif strcmpi (str(i),'t')
            grapheme=[grapheme;{'t'}];
         elseif strcmpi(str(i),'tt')|| strcmpi(str(i),'te')
            grapheme=[grapheme;{'t'}]; 
            i=i+1; 
         elseif strcmpi (str(i),'v')
            grapheme=[grapheme;{'v'}];
         elseif strcmpi(str(i),'ve')
            grapheme=[grapheme;{'v'}]; 
            i=i+1;
         % Exact repeat of the two 'v' / 've' branches above; never reached.
         elseif strcmpi (str(i),'v')
            grapheme=[grapheme;{'v'}];
         elseif strcmpi(str(i),'ve')
            grapheme=[grapheme;{'v'}]; 
            i=i+1;
         elseif strcmpi (str(i),'w')
            grapheme=[grapheme;{'w'}];
         elseif strcmpi(str(i),'wh')
            grapheme=[grapheme;{'w'}]; 
            i=i+1;
         elseif strcmpi(str(i),'x')
            grapheme=[grapheme;{'x'}];
        % NOTE(review): strcmp is called with a single argument here; it needs two.
        elseif strcmpi (str(i),'y') && strcmp('I')
            grapheme=[grapheme;{'y'}];
        elseif strcmpi(str(i),'z')
            grapheme=[grapheme;{'z'}];
        elseif strcmpi(str(i),'zz')||strcmpi(str(i),'ze')
            grapheme=[grapheme;{'z'}]; 
            i=i+1;
          % --- digraphs kept as two-letter graphemes ('zh' is stored as 'z') ---
          elseif strcmpi(str(i),'sh')
            grapheme=[grapheme;{'sh'}]; 
            i=i+1;
         elseif strcmpi(str(i),'zh')
            grapheme=[grapheme;{'z'}]; 
            i=i+1;
         elseif strcmpi(str(i),'ch')
            grapheme=[grapheme;{'ch'}]; 
            i=i+1;
         elseif strcmpi(str(i),'th')
            grapheme=[grapheme;{'th'}]; 
            i=i+1;
         % --- vowels ---
         elseif strcmpi(str(i),'a')
            grapheme=[grapheme;{'a'}]; 

         elseif strcmpi(str(i),'e')
            grapheme=[grapheme;{'e'}]; 
         elseif strcmpi(str(i),'i')
            grapheme=[grapheme;{'i'}]; 
         elseif strcmpi(str(i),'o')
            grapheme=[grapheme;{'o'}]; 
         elseif strcmpi(str(i),'u')
            grapheme=[grapheme;{'u'}];        
       end
       % No `i=i+1` here: after a single-letter match (or no match at all) the
       % loop re-tests the same character forever.
       end
display(grapheme);
Meraki
  • 33
  • 1
  • 5
  • Could you try with the simplest string input with which it should work and see if it's still stuck in a limbo-ish while-loop? I tried with an input string of `'hg'` and it's stuck inside. Are there any valid string criteria to be maintained? – Divakar Mar 26 '16 at 19:20
  • I see a few issues: `strcmpi(str(i),'th')` and other two-letter comparisons. Also stuff like `str(i:i+1)` and `str(i:i+2)` could go out of bound.. Also this is clunky `grapheme=[grapheme;{'d'}]`, better to preallocate – Amro Mar 26 '16 at 19:29
  • @Divakar Yes,even the simplest of strings are stuck.. I had tried with 'bd' and it showed 'busy'. As for string criteria, there has to be just alphabets for input (preferably lower case). Haven't validated anything as yet. – Meraki Mar 26 '16 at 21:05
  • @Amro Yes, there are issues. The biggest problem is that the console is not getting anywhere to even be able to throw any errors. My problem is that I'm aware of the inevitable out of bounds exceptions it may throw. However, I have no idea how to remove it and get my work done anyway. I need to check for double or even triple letters to find similar sounding graphemes. – Meraki Mar 26 '16 at 21:09
  • debug it then starting from the bottom up. Comment out all sections of if/else, and add them one by one, testing each time with an input string, until you see what's causing a possibly infinite loop.. At first only consider single letter comparisons, once you get that working, figure out how to add 2 and 3 letters by checking bounds correctly... – Amro Mar 26 '16 at 21:18
  • @Amro Thanks for your suggestion. Could you also suggest me if I could use the concepts of datasets or anything similar a feature in Matlab for efficient storage of the rules? – Meraki Mar 28 '16 at 16:30
  • sure, you could do that to shorten the code, i.e storing the rules as pairs of test conditions and actions. That way you could loop over the rules and check with one if-statement inside the loop. But this is only about refactoring the code, not really about performance... – Amro Mar 29 '16 at 14:50
  • I got that. Thank you so much for your guidance, I've managed to debug it appropriately. :) – Meraki Mar 30 '16 at 13:49
  • Alright. By the way if you found the problem, you can answer your own question. – Amro Mar 30 '16 at 16:23

1 Answer

0
% Segment a word typed at the console into graphemes.
%
% Input : one line of text read with input(...,'s').
% Output: `grapheme`, a column cell array holding one grapheme per cell,
%         shown with display().
%
% Rules, applied left to right at each position:
%   1. a space becomes '*';
%   2. a capital 'I' becomes 'ie' (case-sensitive, checked before digraphs);
%   3. a two-letter spelling listed in `rules` (case-insensitive) becomes its
%      mapped grapheme and consumes both letters;
%   4. anything else is copied through unchanged.
%
% Changes from the earlier if/elseif version of this script:
%   * 'ph' and 'gh' now consume both letters (the second was emitted again);
%   * 'te' is actually matched (it was compared against a single character);
%   * the last character follows rules 1-2 like every other character;
%   * duplicate 'ff' and 'ou' tests and a stray console echo are gone;
%   * the output is preallocated instead of being grown on every iteration.
prompt = 'Enter a string : ';
str = input(prompt,'s');
l = length(str);

% Column 1: two-letter spelling. Column 2: grapheme it is reduced to.
% Every spelling is unique, so the order of the rows does not matter.
rules = {
    'sh','sh'; 'ch','ch'; 'th','th'; 'wh','w'
    'ee','ee'; 'ea','ee'; 'oa','oa'; 'ou','ou'; 'oo','oo'; 'er','er'
    'oy','oy'; 'ai','ai'; 'ew','ew'; 'ie','ie'
    'bb','b'
    'll','l';  'le','l'
    'dd','d';  'ed','d'
    'ff','f';  'ph','f';  'gh','f'
    'gg','g'
    'ge','j';  'jj','j'
    'ck','c';  'cc','c';  'qu','c'
    'mm','m';  'lm','m';  'mn','m'
    'nn','n';  'kn','n';  'pn','n';  'gn','n'
    'pp','p'
    'rr','r';  'wr','r';  'rh','r'
    'ss','s';  'sc','s';  'ce','s';  'se','s';  'ps','s'
    'tt','t';  'te','t'
    've','v'
    'zz','z'
    };

% At most one grapheme per input character, so l cells are always enough.
grapheme = cell(l,1);
n = 0;      % number of graphemes stored so far
i = 1;      % current position in str

while i <= l
    step = 1;                       % characters consumed at this position
    if double(str(i)) == 32         % space marks a word boundary
        g = '*';
    elseif strcmp(str(i),'I')       % capital I only, as in the pronoun
        g = 'ie';
    else
        g = str(i);                 % default: pass the character through
        if i < l                    % a pair exists only before the last char
            k = find(strcmpi(str(i:i+1), rules(:,1)), 1);
            if ~isempty(k)
                g = rules{k,2};
                step = 2;
            end
        end
    end
    n = n + 1;
    grapheme{n} = g;
    i = i + step;
end

% Drop the unused tail of the preallocated array.
grapheme = grapheme(1:n);
display(grapheme);
Meraki
  • 33
  • 1
  • 5