I keep having an issue with my code i'm not sure what else I could do. I want to remove all variants from the product titles. some of them are being removed and some are not. Examples of what is not is being removed is oz,ml, mg and alot of words that are within the new_words_filtered csv file. Im not sure of what other approaches I could take to complete this task, I was thinking regex but I don't know all the patterns of the product names come in or maybe using fuzzy matching to find the longest matching string.
Just a little about the data,15,000+ they are all product titles that include variants like color, size , packaging etc. They are different lengths, formats and some have incorrect spelling and spacing.
I'm hoping someone could take a look at my code and maybe show me what im doing wrong or have any other approaches to solve this problem.
`import pandas as pd
import time
#file_name= 'new_london.csv'
file_name= 'london.csv'
words_filtered = 'new_words_filtered.csv'
colors = 'more_colors.csv'
df = pd.read_csv(file_name, header=None,
names=range(150))
colors_df = pd.read_csv(colors)
words_filtered_df = pd.read_csv(words_filtered)
def filter_lists(x):
x = str(x).strip()
x = " ".join(x.split())
if x.endswith('/'):
x = x[:-1].strip()
if x.endswith('.'):
x = x[:-1].strip()
if x.endswith('/'):
x = x[:-1].strip()
x = x.strip()
if len(x) < 2:
return ''
return x.lower()
colors_df = colors_df.applymap(filter_lists)
colors_df.drop_duplicates(inplace=True)
colors_df.dropna(inplace=True)
colors= list(set([ str(i[0]) for i in
colors_df.values.tolist()]))
colors.append('vanilla')
words_filtered_df.dropna(axis=1, how='all', inplace=True)
words_filtered_df =
words_filtered_df.applymap(filter_lists)
words_filtered_df.drop_duplicates(inplace=True)
words_filtered = set([ str(i[0]) for i in
words_filtered_df.values.tolist()])
words_filtered.remove('')
words_filtered = list(words_filtered)
df.columns = df.iloc[0]
df = df.drop(df.index[[0]])
df.fillna('', inplace=True)
d = df['name']
def filter_data_new(x):
x = x.lower().strip()
x = " ".join(x.split())
x = x.strip()
if x.endswith('.'):
x = x[:-1]
x = x.strip()
if x.endswith('/'):
x = x[:-1]
x = x.strip()
if x.endswith('.'):
x = x[:-1]
x = x.strip()
for i in colors:
if x.endswith(i):
l = len(i)
x = x[:-l]
x = x.strip()
x = x.strip().split('-')
x = "-".join([i.strip() for i in x if len(i.strip())])
for i in words_filtered:
if x.endswith(i):
x = x.strip()
l = len(i)
x = x[:-l]
x = x.strip()
break
x = x.strip().split('-')
x = "-".join([i for i in x if len(i.strip())])
for i in words_filtered:
if x.endswith(i):
x = x.strip()
l = len(i)
x = x[:-l]
x = x.strip()
break
x = x.strip().split('-')
x = " -".join([i for i in x if len(i.strip())])
if x.endswith('oz') or x.endswith('ml') :
x = x[:-2]
x = x.strip().split()
x = " ".join(x[:-1])
if x.endswith('jar'):
x = x[:-3]
x = x.strip().split()
x = " ".join(x[:-1])
return x.strip()
y = d.map(filter_data_new)
df['name'] = y
df.to_csv('london_new'+str(time.time()).replace('.','_')+'.csv', index=False)