1

I am creating a histogram which shows me the word and the frequency of each word within a text file.

I went through my previous code which worked and attempted to make it modular. This was practice for class, as we will be creating a Tweet Generator in the future. Somewhere I am doing something wrong, but I can't see what it is for the life of me.

I were to create from the plain-text file:

  1. List of Lists
  2. List of Tuples
  3. Dictionary of key value pairs

Here is what I have so far:

import re
import sys
import string

def read_file(file):
    # first_list = [] ### Unsure if I should actually keep these in here.
    # second_list = []###
    document_text = open(file, 'r')
    text_string = document_text.read().lower()
    match_pattern = re.findall(r'\b[a-z]{1, 15}\b', text_string)
    return match_pattern
#----------LIST_OF_LISTS---------------
def list_of_lists(match_pattern):
    read_file(file)
    match_pattern.sort()
    list_array = []
    count = 0
    index = None
    for word in match_pattern:
        if word == index:
            count += 1
        else:
            list_array.append([index, count])
            index = word
            count = 1
    else:

        list_array([index, count])
        list_array.pop(0)
    return str(list_array)
#------END OF LIST_OF_LISTS-------------        

#----------LIST_OF_TUPLES---------------
def list_of_tuples(match_pattern):
    read_file(file)
    frequency = {}
    first_list = []
    second_list = []
    unique_count = 0
    for word in match_pattern:
        count = frequency.get(word, 0)
        frequency[word] = count + 1
        first_list.append(word)
        if int(frequency[word]) == 1:
            unique_count += 1

    for word in match_pattern:
        second_list.append(int(frequency[word]))    

    zipped = zip(first_list, second_list)
    return list(set((zipped)))
    return str("There are " + str(unique_count) + " words in this file")
#----------END OF LIST_OF_TUPLES---------


#----------DICTIONARY FUNCTION-----------
def dictionary_histogram(match_pattern):
    dict_histo = {}
    for word in match_pattern:
        if word not in dict_histo:
            dict_histo[word] = 1
        else:
            dict_histo[word] += 1
    return str(dict_histo)

    def unique_word_dict(histogram):
        ''' Takes the histogram and returns the amount of unique words withi it.'''
        return len(histogram.keys())

    def frequency(histogram, word):
        '''takes in the histogram and a word, then returns a value of the word if the
        key exists within the dictionary, else return 0'''
        if word in histogram:
            return str(histogram[word])
        else:
            return str(0)
#------------End of Dictionary-----------------
# 
# def unique_word(histogram):
#     ''' Takes the histogram and returns the amount of unique words withi it.'''
#     return len(histogram)
# 
# def frequency(word, histogram):
#     '''takes a histogram and a word, then returns the value of the word.'''
#     return histogram[word]



if __name__ == '__main__':
    file = str(sys.argv[1])
    read_file(file)
    list_of_tuples(match_pattern)

Although, I do believe my if name == 'main': is wrong but I did try several different variations and nothing seemed to work for me.

I also tried changing some things, but this didn't work either.

import re
import sys
import string

def read_file(file):
    document_text = open(file, 'r')
    text_string = document_text.read().lower()
    # match_pattern = re.findall(r'\b[a-z]{1, 15}\b', text_string)   ### Think I should move this to the actual list function maybe??? 
    ### I originally had it return match_pattern and then I used match_pattern in my list functions i.e list_of_lists(match_pattern)
    document_text.close()
    return text_string
#----------LIST_OF_LISTS---------------
def list_of_lists(text_string):
    match_pattern = re.findall(r'\b[a-z]{1, 15}\b', text_string)
    # match_pattern.sort() #Maybe use this
    list_array = []
    count = 0
    index = None
    for word in match_pattern:
        if word == index:
            count += 1
        else:
            list_array.append([index, count])
            index = word
            count = 1
    else:

        list_array.append([index, count])
        list_array.pop(0)
    return str(list_array)
#------END OF LIST_OF_LISTS-------------        

#----------LIST_OF_TUPLES---------------
def list_of_tuples(text_string):
    match_pattern = re.findall(r'\b[a-z]{1, 15}\b', text_string)
    frequency = {}
    first_list = []
    second_list = []
    unique_count = 0
    for word in match_pattern:
        count = frequency.get(word, 0)
        frequency[word] = count + 1
        first_list.append(word)
        if int(frequency[word]) == 1:
            unique_count += 1

    for word in match_pattern:
        second_list.append(int(frequency[word]))    

    zipped = zip(first_list, second_list)
    # return list(set((zipped)))
    return str(list(set(zipped)))
    # return str("There are " + str(unique_count) + " words in this file")
#----------END OF LIST_OF_TUPLES---------


#----------DICTIONARY FUNCTION-----------
def dictionary_histogram(text_string):
    dict_histo = {}
    for word in match_pattern:
        if word not in dict_histo:
            dict_histo[word] = 1
        else:
            dict_histo[word] += 1
    return str(dict_histo)

    def unique_word_dict(histogram):
        ''' Takes the histogram and returns the amount of unique words withi it.'''
        return len(histogram.keys())

    def frequency(histogram, word):
        '''takes in the histogram and a word, then returns a value of the word if the
        key exists within the dictionary, else return 0'''
        if word in histogram:
            return str(histogram[word])
        else:
            return str(0)
#------------End of Dictionary-----------------
# 
# def unique_word(histogram):
#     ''' Takes the histogram and returns the amount of unique words withi it.'''
#     return len(histogram)
# 
# def frequency(word, histogram):
#     '''takes a histogram and a word, then returns the value of the word.'''
#     return histogram[word]

# read_file(file)
# list_of_tuples(read_file(file))
if __name__ == '__main__':
    file = str(sys.argv[1])
    # print(list_of_lists(read_file(file)))
Anwar Azeez
  • 101
  • 4

1 Answers1

0

I made 2 minor modifications of your code.

First. I replaced regex \b[a-z]{1, 15}\b with \b[a-z]+\b.

Second. I modified main suite:

if __name__ == '__main__':

    file = str(sys.argv[1])

    match_pattern = read_file(file)
    print(match_pattern)
    print()

    ans = list_of_tuples(match_pattern)
    print(ans)

Output for my sample file:

['asdf', 'asdf', 'asdf', 'sdf', 'asdf', 'asdf', 'asdfdf', 'asdfsdf', 'asdfasd', 'fas', 'dfa', 'sd', 'fass', 'dfafas', 'df', 'asdfsdf', 'asdfsdf', 'asdfdfa', 'sdf', 'asdfdf', 'asdfsdfas', 'dfasdf', 'asdfdfasdf', 'asdffas', 'dfasdffas', 'dfs', 'fas', 'sdf', 'asdfd', 'asdfsd', 'asfd', 'as', 'dfdfa', 'sddf', 'asd', 'fasdf', 'asdf', 'assdf', 'asdf', 'asdf', 'das', 'assdffa', 'sdf', 'asdf', 'asdf', 'assdf', 'asd', 'asd', 'asfdd', 'fasasdf', 'asdf', 'assdf', 'asdf', 'assd']

[('asdfsdfas', 1), ('dfafas', 1), ('dfasdffas', 1), ('asdf', 12), ('as', 1), ('dfasdf', 1), ('fasdf', 1), ('assd', 1), ('assdf', 3), ('dfs', 1), ('asdfdf', 2), ('asd', 3), ('df', 1), ('dfdfa', 1), ('fasasdf', 1), ('asdfsd', 1), ('asfd', 1), ('das', 1), ('asfdd', 1), ('asdffas', 1), ('sdf', 4), ('sddf', 1), ('dfa', 1), ('asdfdfasdf', 1), ('asdfsdf', 3), ('assdffa', 1), ('asdfd', 1), ('asdfasd', 1), ('sd', 1), ('fas', 2), ('asdfdfa', 1), ('fass', 1)]

So the program runs and the output looks like some legit result.