0

My goal is to analyze a text file: split it into words (tokens), then print the words in alphabetical order, each with the number of times it occurs.

Example: 
Input: The house is on the ground on earth.

Output:
earth - 1
ground - 1
house - 1 
is - 1
on - 2
the - 2

I have been able to open the file, read it line by line, tokenize each word, and convert the tokens to lowercase. I am stuck on grouping and alphabetizing the tokens.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void lower_string(char s[]);

/*
 * Reads "test.txt" line by line, lowercases each line, splits it into
 * tokens on the characters in delim, prints every token and finally prints
 * the total number of tokens seen.
 *
 * Exits with EXIT_FAILURE if the file cannot be opened; returns 0 otherwise.
 *
 * Note: lines are read in chunks of at most sizeof(line) - 1 characters, so
 * a word that straddles a chunk boundary is reported as two tokens.
 */
int main(void)
{
    /* The input path is hard-coded while testing, to avoid retyping it on every run. */
    FILE *file = fopen("test.txt", "r");

    /* Check if file opened successfully */
    if (file == NULL)
    {
        printf("\nUnable to open file.\n");
        printf("Please check if file exists and you have read privilege.\n");

        exit(EXIT_FAILURE);
    }

    const char delim[] = " ,.;!?[\n]";
    char line[100];
    int tokenNum = 0;   /* must start at 0: it is incremented and printed below */

    while (fgets(line, sizeof(line), file) != NULL)
    {
        lower_string(line);

        /* strtok() writes '\0' over delimiters in line and remembers its
           position between calls; passing NULL continues the same line. */
        for (char *token = strtok(line, delim); token != NULL; token = strtok(NULL, delim))
        {
            printf("Token:%s\n", token);
            tokenNum++;
        }
    }

    printf("%d\n", tokenNum);  //total words testing

    /* Close files to release resources */
    fclose(file);

    return 0;
}
/*
 * Converts every uppercase letter of the NUL-terminated string s to
 * lowercase, in place. All other characters are left untouched.
 */
void lower_string(char s[])
{
    for (size_t i = 0; s[i] != '\0'; i++)
    {
        /* tolower() replaces the ASCII-only "+ 32" arithmetic; its argument
           must be representable as unsigned char, hence the cast. */
        s[i] = (char) tolower((unsigned char) s[i]);
    }
}

I have been looking into building and manipulating an ordered linked list of integers and a binary search tree of integers, but I'm having a hard time figuring out where to begin adapting them to words. So far I have been looking at the code below for an ordered linked list.

#include <stdio.h>
#include <stdlib.h>
//These structures are declared globally so they are available to all functions
//in the program.
//One node of the ordered singly linked list.
typedef struct list_node_s
{
    int key;            //key value stored in this node - here an integer
    int count;          //number of times this key value was encountered in the input
    struct list_node_s *restp;  //pointer to the next node in the list; NULL at end of list (EOL)

} list_node_t;

//Head of an ordered list: the entry point to the chain of nodes plus a node count.
typedef struct
{
    list_node_t *headp;     //pointer to first node in list, NULL if list is empty
    int size;               //current number of nodes in the list
} ordered_list_t;

//Prototypes
list_node_t * insert_in_order (list_node_t * old_listp, int new_key);
void insert (ordered_list_t * listp, int key);
int delete (ordered_list_t * listp, int target);
list_node_t * delete_ordered_node (list_node_t * listp, int target,int *is_deleted);

void print_list (ordered_list_t * listp);

#define SEND -999       //end of input sentinal
int main (void)
{
    int next_key;
    ordered_list_t my_list = {NULL, 0};

    printf("\n\nProgram to build, display and manipulate (delete) an Ordered Linked List \n");
    printf("\nAdapted from code in \"Problem Solving and Programming in C\" by J.R. Hanly and E.B. Koffman\n\n");
    printf ("enter integer keys - end list with %d\n", SEND);

/* build list by in-order insertions*/
    for (scanf ("%d", &next_key);
    next_key != SEND;
    scanf ("%d", &next_key))
    {
        insert (&my_list, next_key);
    }

/* Display completed list */
    printf ("\nOrdered list as built:\n");
    print_list(&my_list);

/* Process requested deletions */
    printf("enter key value for node to be removed from list or %d to end > ", SEND);

    for (scanf ("%d", &next_key);
    next_key != SEND;
    scanf ("%d", &next_key))
    {
        if (delete (&my_list, next_key))
        {
            printf ("%d deleted.\n  New list:\n", next_key);
            print_list (&my_list);
        }
        else
        {
            printf ("No deletion.  %d not found\n", next_key);
        }

    printf ("enter key value for node to be removed from list or %d to end > ", SEND);
    }
    return (0);
}

/* Prints every node of the list headed by listp, one "key/count" line per
   node in list order, then two newline characters. */
void print_list (ordered_list_t * listp)
{
    list_node_t *nodep = listp->headp;

    while (nodep != NULL)
    {
        printf ("key = %d;  count = %d\n", nodep->key, nodep->count);
        nodep = nodep->restp;
    }

    /* printed once, after the whole list - not once per node */
    printf ("\n\n");
}

/* Inserts new_key into the ascending list that starts at old_listp and
   returns a pointer to the first node of the resulting list.

   - If a node with new_key already exists, its count is incremented and no
     node is added.
   - Otherwise a new node with count 1 is linked in front of the first node
     whose key is greater than new_key (or at the end of the list).

   If memory for a new node cannot be allocated, an error is printed to
   stderr and the program exits with EXIT_FAILURE. */
list_node_t * insert_in_order (list_node_t * old_listp, int new_key)
{
    /* linkp always points at the pointer that leads to the current node
       (first the head pointer, then some node's restp), so inserting at the
       head and inserting in the middle are the same operation. */
    list_node_t **linkp = &old_listp;
    list_node_t *nodep;

    while (*linkp != NULL && (*linkp)->key < new_key)
    {
        linkp = &(*linkp)->restp;
    }

    if (*linkp != NULL && (*linkp)->key == new_key)     //matching key, increment count
    {
        (*linkp)->count++;
        return (old_listp);
    }

    nodep = malloc (sizeof *nodep);
    if (nodep == NULL)
    {
        fprintf (stderr, "insert_in_order: out of memory\n");
        exit (EXIT_FAILURE);
    }
    nodep->key = new_key;
    nodep->count = 1;
    nodep->restp = *linkp;      //rest of the list (or NULL at EOL) follows the new node
    *linkp = nodep;

    return (old_listp);
}
/* Inserts key into the ordered list *listp.

   A key that is already present only has its count incremented by
   insert_in_order(), so size - the number of nodes in the list - is
   increased only when the key is new. */
void insert (ordered_list_t * listp, int key)
{
    list_node_t *nodep = listp->headp;
    int is_new;

    /* The list is kept in ascending key order, so the search can stop at the
       first node whose key is not smaller than key. */
    while (nodep != NULL && nodep->key < key)
    {
        nodep = nodep->restp;
    }
    is_new = (nodep == NULL || nodep->key != key);

    listp->headp = insert_in_order (listp->headp, key);
    if (is_new)
    {
        ++(listp->size);
    }
}
/* Removes the first node holding target from the ordered list *listp.
   Returns 1 when a node was found and removed, 0 when target is not in the
   list. The list's size is reduced by one on success. */
int delete (ordered_list_t * listp, int target)
{
    int removed;

    listp->headp = delete_ordered_node (listp->headp, target, &removed);
    if (!removed)
    {
        return (0);
    }

    listp->size -= 1;       //keep the node count in step with the list
    return (removed);
}

/* Deletes the first node containing target from the list whose head is
   listp and returns a pointer to the (possibly new) first node of the list.
   The memory of the deleted node is freed. *is_deleted is set to 1 when a
   node was removed and to 0 when no node holds target.

   A trace line is printed for every node visited.
*/
list_node_t * delete_ordered_node (list_node_t * listp, int target, int *is_deleted)
{
    /* linkp points at the pointer leading to the node under inspection:
       first the local head pointer, afterwards some node's restp. */
    list_node_t **linkp = &listp;

    for (;;)
    {
        list_node_t *curp = *linkp;

        printf ("check for empty list; target: %d \n", target);
        if (curp == NULL)
        {
            //ran off the end (or the list was empty): nothing to delete
            *is_deleted = 0;
            break;
        }

        if (curp->key == target)
        {
            printf ("at first node; target: %d \n", target);
            *is_deleted = 1;
            *linkp = curp->restp;   //bypass the node before releasing it
            free (curp);
            break;
        }

        printf ("chase down list to find: %d \n", target);
        linkp = &curp->restp;
    }

    return (listp);
}

I'm finding it hard to implement that with strtok.

12/4 EDIT: added: Nodes for BST. Questions-

  1. I don't know whether `key` needs to be tracked. (I assume it will be useful for pulling out specific words.)
  2. Where and how would I add the logic to alphabetize the tree? (Study sources appreciated.)
  3. How do I pass each word through this tree?
#define WLENGTH 100     //size of the word buffer in every node, including the terminating '\0'

//Base Node info: one node of the binary search tree
struct node
{
    char word[WLENGTH];         //word text, always NUL-terminated
    int key;                    //integer key, used by insert() and inorder()
    int freq;                   //number of times this node's key/word was inserted
    struct node *left, *right;  //subtrees with smaller / larger entries; NULL when absent
};

//Function to create a new node.
//wordn is copied into the node (NULL is stored as an empty string; text
//longer than WLENGTH - 1 characters is truncated). item becomes the key and
//freqn the initial frequency. Both children start out NULL.
//Prints an error and exits with EXIT_FAILURE if memory cannot be allocated.
struct node *newNode(const char *wordn, int item, int freqn)
{
    struct node *temp = malloc(sizeof *temp);

    if (temp == NULL)
    {
        fprintf(stderr, "newNode: out of memory\n");
        exit(EXIT_FAILURE);
    }

    //An array cannot be assigned with '='; the characters have to be copied.
    //snprintf never writes past the buffer and always terminates the string.
    snprintf(temp->word, sizeof temp->word, "%s", wordn != NULL ? wordn : "");
    temp->key = item;
    temp->freq = freqn;
    temp->left = temp->right = NULL;
    return temp;
}

//Function to print the integer keys in ascending order (in-order traversal:
//left subtree, node, right subtree)
void inorder(struct node *root)
{
    if (root != NULL)
    {
        inorder(root->left);
        printf("%d ", root->key);
        inorder(root->right);
    }
}

/*Function to insert a new node with given key.
  Returns the root of the (sub)tree after insertion. A key that is already
  present adds no node; its frequency is incremented instead. */
struct node* insert(struct node* node, int key)
{
    /* If the tree is empty, return a new node (no word, first occurrence) */
    if (node == NULL)
        return newNode("", key, 1);

    /* Otherwise, recur down the tree */
    if (key < node->key)
        node->left = insert(node->left, key);
    else if (key > node->key)
        node->right = insert(node->right, key);
    else
        node->freq++;

    /* return the (unchanged) node pointer */
    return node;
}

/*Function to insert a word into a tree that is ordered alphabetically.
  Returns the root of the (sub)tree after insertion. The first occurrence of
  a word creates a node with freq 1 (key is set to 0, it is not used for
  ordering here); every further occurrence increments freq.
  Call it once per token: root = insertWord(root, token); */
struct node *insertWord(struct node *node, const char *word)
{
    int cmp;

    if (node == NULL)
        return newNode(word, 0, 1);

    /* Stored words hold at most WLENGTH - 1 characters, so only that many
       are compared; an over-long word then matches its truncated copy. */
    cmp = strncmp(word, node->word, WLENGTH - 1);
    if (cmp < 0)
        node->left = insertWord(node->left, word);
    else if (cmp > 0)
        node->right = insertWord(node->right, word);
    else
        node->freq++;

    return node;
}

/*Function to print "word - frequency" lines in alphabetical order
  (in-order traversal of a tree built with insertWord) */
void inorderWords(const struct node *root)
{
    if (root != NULL)
    {
        inorderWords(root->left);
        printf("%s - %d\n", root->word, root->freq);
        inorderWords(root->right);
    }
}
NM215
  • 1
  • 1
  • Note: `const char delim[20] = " ,.;!?[\n\0]";` same as `const char delim[20] = " ,.;!?[\n";` as `strtok()` stops at the first _null character_. `']'` is not a delimiter. I think you want `const char delim[] = " ,.;!?[\n]";`. – chux - Reinstate Monica Dec 04 '22 at 03:08
  • Aside: Suggested simplification: `new_listp = (list_node_t *) malloc (sizeof (list_node_t));` --> `new_listp = malloc(sizeof new_listp[0]);`. – chux - Reinstate Monica Dec 04 '22 at 03:16
  • A BST (binary search tree) would likely be the simplest solution; each node holding a pointer to its "word" and a count of occurrences. You may consider getting the size of the file and reading its contents into a single buffer instead of 'piecemeal'; (line-by-line) where you need to also allocate/free buffers for each unique word "remembered". Run `strtok()` across the entire buffer and the job is almost done... – Fe2O3 Dec 04 '22 at 04:30
  • ReinstateMonica, ty I made those changes to the code above. I was thinking that the null terminator would've caused a problem but it doesn't seem to affect the output which is good. @Fe2O3 yea I was afraid of BST. Could you give me an example for reading a file via a buffer. Sorry I'm fairly new to C and just need a variety of content to look over. – NM215 Dec 04 '22 at 19:01

1 Answers1

0

At the request of the OP, here is a bit of code to bulk load an entire text file for processing:

// Opens the file fname with the given fopen() mode and returns the stream.
// Never returns NULL: if the file cannot be opened, a message is written to
// stderr and the program exits with EXIT_FAILURE.
// Neither string is modified, so both are const-qualified; string literals
// and plain char * arguments are accepted as before.
FILE *mustOpen( const char *fname, const char *mode ) {
    FILE *fp = fopen( fname, mode );
    if( fp == NULL ) {
        fprintf( stderr, "Cannot open '%s'\n", fname );
        exit( EXIT_FAILURE );
    }
    return fp;
}

// Passed the path to a file, opens, measures and bulk loads the entire file
// into a heap buffer, appending a terminating '\0' so the str*() functions
// can be used on the contents.
// Returns the buffer; the caller owns it and must free() it.
// On any failure (open, seek/measure, allocation, short read) a message is
// written to stderr and the program exits with EXIT_FAILURE.
char *loadFile( char *fname ) {
    FILE *fp = mustOpen( fname, "rb" );

    // ftell() returns a long and reports failure as -1L. Storing that straight
    // into a size_t would turn the error into a huge size, so it is checked
    // while still signed.
    long end = -1L;
    if( fseek( fp, 0, SEEK_END ) == 0 )
        end = ftell( fp );
    if( end < 0 || fseek( fp, 0, SEEK_SET ) != 0 ) {
        fprintf( stderr, "Cannot determine size of '%s'\n", fname );
        fclose( fp );
        exit( EXIT_FAILURE );
    }
    size_t size = (size_t)end;

    char *buf = malloc( size + 1 ); // +1 for the terminating '\0'
    if( buf == NULL ) {
        fprintf( stderr, "Malloc() failed\n" );
        fclose( fp );
        exit( EXIT_FAILURE );
    }

    if( fread( buf, sizeof *buf, size, fp ) != size ) {
        fprintf( stderr, "Read incomplete\n" );
        free( buf );
        fclose( fp );
        exit( EXIT_FAILURE );
    }

    fclose( fp );

    buf[size] = '\0'; // extra byte allows strXXX() to work

    return buf; // pointer to heap allocated buffer containing file's bytes
}

Remember to free() the buffer when done with its contents.

With the entire text loaded (and NUL-terminated, i.e. ending in '\0'), here is a way to skip along the entire "string" finding each "word" (as defined by the delimiters):

// Visits every token of the text, one per iteration; cp points at the current token.
// The first strtok() call receives buf (cp's initial value). The step
// expression then resets cp to NULL, so each later call continues scanning
// where the previous one stopped. The loop ends when strtok() returns NULL.
// buf and delim are not defined in this fragment; strtok() overwrites
// delimiter characters in the text with '\0' as it goes.
for( char *cp = buf; (cp = strtok( cp, delim )) != NULL; cp = NULL ) {
    /* process each single "word" */
}

Since the "text" is in memory, instances of each of the "words" are in memory, too. All that's needed is populating a BST with nodes that 'point to' one instance of each "word" and a counter that counts multiple occurrences of each word.

Finally, an "in order" traversal of the BST will give an alphabetised list of words and their frequency in the text.

Be sure to compartmentalize each of the functions. The "blocks" of functionality can then be re-used in other projects, and, who knows?... You may want to first load a dictionary and only report the words (and locations) that do not appear in the dictionary (typos?). The code that handles the BST "structure" (searching, adding, traversing) should be somewhat independent of what "information fields" comprise each node.

Fe2O3
  • 6,077
  • 2
  • 4
  • 20