Program to do word/phrase replacement exceeds the time limit

Question

Sorry for the possibly long and dumb question, but I'm really stumped. I'm doing a task for the university. Its meaning is very simple. You need to implement a function that will change the "bad" phrases to "good". Input to the function is a text and a double array with good and bad words (in the left column the words that need to be replaced, and on the right column the words to be inserted instead of the bad words). The dictionary itself with bad and good words can have any size, but at the end there will always be a pair of NULL - NULL.

It is important to note that the program should not do anything to change the already replaced phrases. The line "termination specialist" contains the word "specialist", so the program must check to see if there are any words in the text that have already been replaced, so that the line "termination specialist" does not change into the line "termination person with certified level of knowledge". The check happens here.

The program must also make sure that the entered dictionary of good and bad words is correct, which means that a bad word cannot be the beginning of another bad word. This check happens in the function replaceInvalidity

Text and dictionary with words do not have to be meaningful. In the context of this task, it is simply a set of symbols, i.e. letters, numbers, symbols

I wrote a program that passes most of the tests, but for some reason at one of the tests it loops and exceeds the time limit (2 seconds). As a result, I get 0 points for the whole task.

I tried checking the memory with Valgrind, but it did not show any errors.

Full code:

#ifndef __PROGTEST__
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
#endif /* __PROGTEST__ */

int replaceInvalidity(const char * (*replace)[2])
{
    int size = 0;
    for (int i = 0; replace[i][0] != NULL; i++)
        size++;
    for (int i = 0; i < size - 1; i++)
    {
        for (int j = i + 1; j < size; j++)
        {
            if (strlen(replace[i][0]) >= strlen(replace[j][0]))
            {
                if (strstr(replace[i][0], replace[j][0]) == replace[i][0])
                    return 1;
            }
            else
            {
                if (strstr(replace[j][0], replace[i][0]) == replace[j][0])
                    return 1;
            }
        }
    }
    return 0;
}

char *newSpeak(const char *text, const char * (*replace)[2])
{
    if (replaceInvalidity(replace))
    {
        return NULL;
    }

    int i = 0, k = 0, flag= 0, Nlen = 0, Olen = 0, length = 0;
    char *result = (char *)malloc(sizeof(char));
    length = strlen(text);

    for (i = 0, k = 0; i < length; i++, k++)
    {
        flag = 0;
        for (int j = 0; replace[j][1] != NULL; j++)
        {
            if (strstr(&text[i], replace[j][1]) == &text[i])
            {
                Nlen = strlen(replace[j][1]);
                result = (char *)realloc(result, ((k + Nlen + 1) * sizeof(char)));
                for (int l = k; l < k + Nlen; l++)
                    result[l] = replace[j][1][l-k];
                i += Nlen - 1;
                k += Nlen - 1;
                flag = 1;
                break;
            }
        }

        if (flag) continue;

        for (int j = 0; replace[j][0] != NULL; j++)
        {
            if (strstr(&text[i], replace[j][0]) == &text[i])
            {
                Olen = strlen(replace[j][0]);
                Nlen = strlen(replace[j][1]);
                result = (char *)realloc(result, ((k + Nlen + 1) * sizeof(char)));
                for (int l = k; l < k + Nlen; l++)
                    result[l] = replace[j][1][l-k];
                i += Olen - 1;
                k += Nlen - 1;
                flag = 1;
                break;
            }
        }

        if (flag) continue;

        result = (char *)realloc(result, (k + 2) * sizeof(char));
        result[k] = text[i];
    }
    result[k] = '\0';
    return result;
}

#ifndef __PROGTEST__
int main(int argc, char * argv[])
{
    char *res;

    const char * d1[][2] = {
        { "murderer", "termination specialist" },
        { "failure", "non-traditional success" },
        { "specialist", "person with certified level of knowledge" },
        { "dumb", "cerebrally challenged" },
        { "teacher", "voluntary knowledge conveyor" },
        { "evil", "nicenest deprived" },
        { "incorrect answer", "alternative answer" },
        { "student", "client" },
        { NULL, NULL }
    };

    const char * d2[][2] = {
        { "fail", "suboptimal result" },
        { "failure", "non-traditional success" },
        { NULL, NULL }
    };

    res = newSpeak("dumb termination specialist.", d1);
    assert(!strcmp(res, "cerebrally challenged termination specialist."));
    free(res);
  
    res = newSpeak("The student answered an incorrect answer.", d1);
    assert(!strcmp(res, "The client answered an alternative answer."));
    free(res);

    res = newSpeak("He was dumb, his failure was expected.", d1);
    assert(!strcmp(res, "He was cerebrally challenged, his non-traditional success was expected."));
    free(res);

    res = newSpeak("The evil teacher became a murderer.", d1);
    assert(!strcmp(res, "The nicenest deprived voluntary knowledge conveyor became a termination specialist."));
    free(res);

    res = newSpeak("Devil's advocate.", d1);
    assert(!strcmp(res, "Dnicenest deprived's advocate."));
    free(res);

    res = newSpeak("Hello.", d2);
    assert(!res);

    return EXIT_SUCCESS;
}
#endif /* __PROGTEST__ */

It is a lot more helpful to post a [mre] then 3 snippets of code. — Allan Wind, Dec 05 '22 at 02:27
The way you use realloc is unsafe. If it fails with NULL you leak original memory. — Allan Wind, Dec 05 '22 at 02:32
Pasting those 3 snippets together and adding the missing header files I cannot reproduce the issue. — Allan Wind, Dec 05 '22 at 02:33
It's silly that you allocate 1 byte to result. Might as well just initialize it to NULL. — Allan Wind, Dec 05 '22 at 02:39
I understand why you want to search your `text` for `replace[i][0]` but why do you search for the good words `replace[?][1]`? `strstr()` searches the whole string, why do you do that for every character in `text`? — Allan Wind, Dec 05 '22 at 02:52
In your hard-coded test case (great) why should "specialist" not be replaced? — Allan Wind, Dec 05 '22 at 03:47
I'm looking for good words, since I'm not allowed to replace them. In the example "murderer" is replaced by "termination specialist". But there is also a bad word "specialist" in the table, which will be replaced by a "person with certified level of knowledge". As a result of the second pass I will get the string "cerebrally challenged termination person with certified level of knowledge" — w.ilia.m, Dec 05 '22 at 10:34
Please update your question with those additional requirements you just introduced (I see the extra test cases which is awesome). "word" means something specific (usually [a-z][A-Z] and maybe '-' but probably not punctuation marks). Do mean word or phrase? And are you saying that only want one replacement per call to `newSpeak()`? — Allan Wind, Dec 05 '22 at 21:22
Good job on revising the question based on input. It's a little verbose so my advise is to cut out all but the last code segment. I still not able to reproduce a test. Does the __PROGTEST__ suite emit any information at all other than the time out? Also, drop a comment on my answer, to explain why it doesn't work (that way author gets a notification). I still don't see an explanation for why your first test case shouldn't replace specialist. — Allan Wind, Dec 06 '22 at 03:46

Allan Wind · Answer 1 · 2022-12-05T07:05:40.370

I was not able to reproduce the issue after adding the missing includes and combining your 3 snippets. As you phrase the question as a performance issue, I reworked your code to reduce run-time from 0.476 s to 0.275 s per 1e6 calls.

Instead of calling strstr() per character of your input text for a given bad word, only call it once + number of times a given bad word is found in text. Proceed processing text after the replacement. This should make make a significant difference for large input.
Instead of using loops move data in your string use memmove() which is highly optimized.
Instead of calling realloc() for each replacement when the size of the replacement changes.
Removed replaceInvalidity() as I think you were protecting yourself of the replacement string being substring of the input to avoid an infinite loop. The implementation below avoid that by only looking at replacements after the fact.
result = realloc(result, ...) will leak memory on failure so handle the error by free'ing the original string and return NULL on error. strdup() error is handled similarly.
Problem description does not match your test case, so I revised the test case. If this is not correct please clarify expected behavior (only replace at most 1 bad word?).

#define _XOPEN_SOURCE 500
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char *newSpeak(const char *text, const char *(*replace)[2]) {
    char *result = strdup(text);
    if(!result)
        return NULL;
    size_t result_len = strlen(result);
    for(size_t i = 0; replace[i][0] && replace[i][0][0] && replace[i][1] && replace[i][1][0]; i++) {
        size_t bad_len = strlen(replace[i][0]);
        size_t good_len = strlen(replace[i][1]);
        char *found = result;
        for(;;) {
            found = strstr(found, replace[i][0]);
            if(!found)
                break;
            size_t offset = found - result;
            if(bad_len < good_len) {
                char *tmp = realloc(result, result_len + good_len - bad_len + 1);
                if(!tmp) {
                    free(result);
                    return NULL;
                }
                result = tmp;
                found = result + offset;
                memmove(found + good_len, found + bad_len, result_len - offset - bad_len + 1);
            } else if(bad_len > good_len) {
                memmove(found + good_len, found + bad_len, result_len - offset - bad_len + 1);
                char *tmp = realloc(result, result_len + good_len - bad_len + 1);
                if(!tmp) {
                    free(result);
                    return NULL;
                }
                result = tmp;
                found = result + offset;
            }
            result_len += good_len - bad_len;
            memcpy(found, replace[i][1], good_len);
            found += good_len;
        }
    }
    return result;
}

int main(void) {
    const char *d1[][2] = {
        { "murderer", "termination specialist" },
        { "failure", "non-traditional success" },
        { "specialist", "person with certified level of knowledge" },
        { "dumb", "cerebrally challenged" },
        { "teacher", "voluntary knowledge conveyor" },
        { "evil", "nicenest deprived" },
        { "incorrect answer", "alternative answer" },
        { "student", "client" },
        { NULL, NULL }
    };
    char *res = newSpeak("dumb termination specialist.", d1);
    assert(!strcmp(res, "cerebrally challenged termination person with certified level of knowledge."));
    free(res);
}

score 0 · Answer 2 · answered Dec 16 '22 at 10:38

Few suggestions:

There is a lot of unnecessary string traversal in replaceInvalidity() function. Functions like strlen(), strstr() etc., use them only when they are really needed, otherwise avoid them. Also, if the dictionary is supposed to be end with NULL then this is not needed:

for (int i = 0; replace[i][0] != NULL; i++) size++;

Use the check for NULL termination of dictionary directly in the for loop condition, instead of, first calculate the size and then use it.

The program must also make sure that the entered dictionary of good and bad words is correct, which means that a bad word cannot be the beginning of another bad word.

For this, you are using strstr() and it will parse the whole string to find out the substring even if the their first character does not match.
If a bad word cannot be the beginning of another bad word then simply start compare their characters from start and if any of the string reaches to end will make the dictionary invalid otherwise not.

In newSpeak() function, your program iterating the input string text character by character and for every character, first it is finding the whole dictionary good phrases as substring and if it is not found then same activity for whole dictionary bad phrases. If the input phrase is big and if there are too many number of elements in dictionary, this is going to take a lot of time in processing. You should think of something better here, may be - extract a word from input text and search for that word in dictionary and based on whole or partial or no match found in dictionary, process further.

You can do something like this ( below code is just for demonstration purpose):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#define A_B_EQUAL       0
#define A_SUBSTR        1
#define B_SUBSTR        2
#define A_B_NOTEQUAL    3


int strMatch (const char * a, const char * b) {
    if (!a || !b) {
        return A_B_NOTEQUAL;
    }

    while(*a && (*a == *b)) {
        a++; b++;
    }

    if ((*a == '\0') && (*b == '\0')) {
        return A_B_EQUAL;
    } else if (*a == '\0') {
        return A_SUBSTR;
    } else if (*b == '\0') {
        return B_SUBSTR;
    }

    return A_B_NOTEQUAL;
}

int replaceInvalidity (const char * (*replace)[2]) {
    for (int i = 0; replace[i][0] && replace[i + 1][0]; i++) {
        for (int j = i + 1; replace[j][0]; j++) {
            if (strMatch (replace[j][0], replace[i][0]) != A_B_NOTEQUAL) {
                fprintf (stdout, "Invalid entries found in dictionary - [%s, %s]\n", replace[j][0], replace[i][0]);
                return 1;
            }
        }
    }

    return 0;
}

int findInDict (const char * (*replace)[2], const char * ps, int len) {
    if (!replace || !ps || !len) {
        fprintf (stderr, "(%s):Invalid argument...\n", __func__);
        return -1;
    }

    int index = -1;
    for (int i = 0; replace[i][0] && (index == -1); i++) {
        if (strncmp (replace[i][0], ps, len) == 0) {
            index = i;
        }
        if ((index != -1) && (replace[index][0][len] != '\0')) {
            //dictionary entry partially matched, match rest
            int res = strMatch (&ps[len], &replace[index][0][len]);
            if ((res != A_B_EQUAL) && (res != B_SUBSTR)) {
                index = -1;
            }
        }
    }

    return index;
}

char * newSpeak (const char * text, const char * (*replace)[2]) {
    if ((!text) || (replaceInvalidity(replace))) {
        fprintf (stderr, "(%s):Invalid argument...\n", __func__);
        return NULL;
    }

    char * result = NULL;
    int resultlen = 0;

    while (*text) {
        int ws_and_oc = 0;
        int curr_text_len = 0;
        const char * start = text;
        const char * str = NULL;

        while (isspace (*text) || !isalpha (*text)) {
            ws_and_oc++; text++;
        }

        while (isalpha (*text)) {
            curr_text_len++; text++;
        }

        int dict_index = findInDict (replace, start + ws_and_oc, curr_text_len);

        if (dict_index >= 0) {
            int len = strlen (replace [dict_index][0]);
            // adjust the text pointer and curr_text_len when the dictionary bad word is a phrase and not just a word
            text = (((text - start - ws_and_oc) == len) ? text : start + len + ws_and_oc);
            curr_text_len = strlen (replace [dict_index][1]);
            str = replace [dict_index][1];
        } else {
            str = start + ws_and_oc;
        }

        char * tmp;
        result = realloc (tmp = result, resultlen + curr_text_len + ws_and_oc + 1);
        if (result == NULL) {
            fprintf (stderr, "(%s:%d):Failed to allocate memory...\n", __func__, __LINE__);
            free (tmp);
            return NULL;
        }

        for (int i = 0; i < ws_and_oc; ++i) {
            result[resultlen++] = start[i];
        }

        for (int i = 0; i < curr_text_len; ++i) {
            result[resultlen++] = str[i];
        }
    }

    result[resultlen] = '\0';
    return result;
}

int main (void) {
    char * res;

    const char * d1 [][2] = {
            { "murderer", "termination specialist" },
            { "failure", "non-traditional success" },
            { "specialist", "person with certified level of knowledge" },
            { "dumb", "cerebrally challenged" },
            { "teacher", "voluntary knowledge conveyor" },
            { "evil", "nicenest deprived" },
            { "incorrect answer", "alternative answer" },
            { "student", "client" },
            { NULL, NULL }
        };

    res = newSpeak ("dumb termination specialist.", d1);
    if (res) {
        assert (!strcmp (res, "cerebrally challenged termination person with certified level of knowledge."));
        free (res);
    }
  
    res = newSpeak ("The student answered an incorrect answer.", d1);
    if (res) {
        assert (!strcmp ( res, "The client answered an alternative answer."));
        free (res);
    }

    res = newSpeak ("He was dumb, his failure was expected.", d1);
    if (res) {
        assert (!strcmp ( res, "He was cerebrally challenged, his non-traditional success was expected."));
        free (res);
    }

    res = newSpeak ("The evil teacher became a murderer.", d1);
    if (res) {
        assert (!strcmp ( res, "The nicenest deprived voluntary knowledge conveyor became a termination specialist."));
        free (res);
    }

    return 0;
}

I have skipped a couple of test cases because of lack of clarity -

The first one is:

  res = newSpeak ( "Devil's advocate.", d1 );
  assert ( ! strcmp ( res, "Dnicenest deprived's advocate." ) );
  free ( res );

here a substring of a word of phrase, which exists in dictionary, is replaced with good phrase. What if Devil is also exists in the dictionary? What should be the behaviour in this case? Should look for best match or first match (even partial will work fine)..?
May be, once you have clarity around it, you can make the appropriate changes in the findInDict() function.

and second is:

  res = newSpeak ( "Hello.", d2 );
  assert ( ! res );

Why this test case expect res to be NULL? Based on the information you have provide res should be Hello. and not NULL.

Program to do word/phrase replacement exceeds the time limit

2 Answers2