I'm not sure where I'm going wrong here and why my data is returning wrong. Writing this code to use fuzzywuzzy to clean bad input road names against a list of correct names, replacing the incorrect with the closest match.
It's returning all lines of data2
back. I'm looking for it to return the same, or replaced lines of data1
back to me.
My Minimal, Reproducible Example:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data2 = data2.split(',')
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2[0])
My full code
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def convert_tolist(string):
li = list(string.split(" "))
return li
with open(r"Cass_Howard - Copy.csv") as csv_file,\
open("Final_Test_Clean.csv", "w") as f,\
open(r"TEST_no_dups12.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader = csv.reader(f, delimiter=',')
file_1 = csv_file
file_2 = ul
for data1, data2 in zip(file_1, file_2):
data1 = data1.split(',')
data1 = data1[18]
data1 = data1.upper()
data2 = data2.strip()
data2 = data2.split(',')
data2 = ''.join(data2)
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data1 = list(data1)
convert_tolist(data2)
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2)
street2
query data(around 950 lines)
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2
choices data(around 200 lines)
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN