4

I'm looking to clean a dataset with 61k rows. I need to clean its street address column. Presently, the addresses are a nightmare. Sometimes full addresses are written out (i.e. 111 Frederick Douglass Blvd) other times the same address will be written in short hand (i.e 111 8th Ave/ 111 8th Avenue).

I'm a newer pandas/python user so i'm not sure how to tackle such massive cleaning needs. Before I've used .replace() but that seems to be the completely wrong approach. Would appreciate any of your insights.

Edit: Essentially I'm looking to standardize addresses. Addresses were inputed either incorrectly or in multiple different formats. Long term goal is to map all of these addresses but in the data's present form that's not possible

Dataset: https://data.cityofnewyork.us/City-Government/Evictions/6z8x-wfk4

ynnad
  • 73
  • 1
  • 6

1 Answers1

4

I've used this function to standardize addresses (and put the street type in shortened form, consistent with Google's formatting). Apologies for the code dump, but the regex is full of interesting things to consider.

I've also added digits for when Avenues are called by their ordinal:

def shorten_rd(address):
    '''Completes the road type. I.e. Rd becomes Road, st becomes Street as per Google etc.'''
    address = address.title()
    address = re.sub(r" Street(?=$| [NE(So|S$)(We|W$)])", ' St', address)
    address = re.sub(r" Road(?=$| [NE(So|S$)(We|W$)])", ' Rd', address)
    address = re.sub(r"(?<!The) Avenue(?=$| [NE(So|S$)(We|W$)])", ' Ave', address)
    address = re.sub(r" Close(?=$| [NE(So|S$)(We|W$)])", ' Cl', address)
    address = re.sub(r" Court(?=$| [NE(So|S$)(We|W$)])", ' Ct', address)
    address = re.sub(r"(?<!The) Crescent(?=$| [NE(So|S$)(We|W$)])", ' Cres', address)
    address = re.sub(r" Boulevarde?(?=$| [NE(So|S$)(We|W$)])", ' Blvd', address)
    address = re.sub(r" Drive(?=$| [NE(So|S$)(We|W$)])", ' Dr', address)
    address = re.sub(r" Lane(?=$| [NE(So|S$)(We|W$)])", ' Ln', address)
    address = re.sub(r" Place(?=$| [NE(So|S$)(We|W$)])", ' Pl', address)
    address = re.sub(r" Square(?=$| [NE(So|S$)(We|W$)])", ' Sq', address)
    address = re.sub(r"(?<!The) Parade(?=$| [NE(So|S$)(We|W$)])", ' Pde', address)
    address = re.sub(r" Circuit(?=$| [NE(So|S$)(We|W$)])", ' Cct', address)
    return address

def lengthen_rd(address):
    address = address.title()
    address = re.sub(r" St(?=$| [NE(So|S$)(We|W$)])", " Street", address)
    address = re.sub(r" Rd(?=$| [NE(So|S$)(We|W$)])", " Road", address)
    address = re.sub(r" Ave(?=$| [NE(So|S$)(We|W$)])", " Avenue", address)
    address = re.sub(r" Cl(?=$| [NE(So|S$)(We|W$)])", " Close", address)
    address = re.sub(r" Ct(?=$| [NE(So|S$)(We|W$)])", " Court", address)
    address = re.sub(r" Cres(?=$| [NE(So|S$)(We|W$)])", " Crescent", address)
    address = re.sub(r" Blvd(?=$| [NE(So|S$)(We|W$)])", " Boulevard", address)
    address = re.sub(r" Dr(?=$| [NE(So|S$)(We|W$)])", " Drive", address)
    address = re.sub(r" Ln(?=$| [NE(So|S$)(We|W$)])", " Lane", address)
    address = re.sub(r" Pl(?=$| [NE(So|S$)(We|W$)])", " Place", address)
    address = re.sub(r" Sq(?=$| [NE(So|S$)(We|W$)])", " Square", address)
    address = re.sub(r" Pde(?=$| [NE(So|S$)(We|W$)])", " Parade", address)
    address = re.sub(r" Cct(?=$| [NE(So|S$)(We|W$)])", " Circuit", address)
    return address

def standard_addr(address):
    '''Checks for unit numbers and street addresses and puts them in the standard format''' 
    #print("################################")
    #print("### Address: ", address)
    unit_nums = re.findall(r"(?<=Unit )\w?\d+\w?|(?<=U)\d+\w?|\w?\d+\w?(?=\s*/)", address)
    unit_num = unit_nums[0] if len(unit_nums)==1 else ""
    #print("Unit Number: ", unit_num)
    proc_addr = re.sub(r"Unit \w?\d+\w?/?|U\d+\w?/?|\w?\d+\w?\s*/", "", address)
    proc_addr = re.sub(r"^[,\- ]+|[,\- ]+$", "", proc_addr)
    #print("Unitless address: ", proc_addr)
    type_opts = r"Terrace|Way|Walk|St|Rd|Ave|Cl|Ct|Cres|Blvd|Dr|Ln|Pl|Sq|Pde|Cct"
    road_attrs_pattern = r"(?P<rd_no>\w?\d+(\-\d+)?\w?\s+)(?P<rd_nm>[a-zA-z \d\-]+)\s+(?P<rd_tp>" + type_opts + ")"
    #print("Road Attr Pattern: ", road_attrs_pattern)
    road_attrs = re.search(road_attrs_pattern, proc_addr)
    try:
        road_num = road_attrs.group('rd_no').strip()
    except AttributeError:
        road_num = ""
    #print("Road number: ", road_num)
    try:
        road_name = road_attrs.group('rd_nm').strip()
    except AttributeError:
        road_name = ""
    #print("Road name: ", road_name)
    try:
        road_type = road_attrs.group('rd_tp').strip()
    except AttributeError:
        road_type = ""
    #print("Road type: ", road_type)
    proc_addr = lengthen_rd(re.sub(r"^[,\- ]+|[,\- ]+$", "", re.sub(road_attrs_pattern, "", proc_addr)))
    #print("Leftover: ", proc_addr)

    unit_seg = (unit_num + "/" if unit_num!="" else "") if road_num != "" else ("Unit " + unit_num + ", " if unit_num!="" else "")
    road_seg = ((road_num + " " if road_num!="" else "") + road_name + " " + road_type).strip()
    post_road_seg = " " + proc_addr if proc_addr != "" else ""
    proc_addr = (unit_seg + road_seg) + post_road_seg
    #print("### Processed Address: ", proc_addr)
    return proc_addr
AER
  • 1,549
  • 19
  • 37
  • 1
    I don't know Python, but I do know Regex `(?=$| [NE(So|S$)(We|W$)])` should be saying `` because that's what _character classes_ do, even in Python. I think what you want is `(?=\b(?=N(?:[EW]|o(?:rth(?:east|west)?)?)?|S(?:[EW]|o(?:uth(?:east|west)?)?)?|E(?:a(?:st)?)?|W(?:e(?:st)?)?)$)` https://regex101.com/r/5qAu1D/1/ https://regex101.com/r/5qAu1D/2 – Regular Jo Jan 23 '21 at 16:59