Here's one that might help. Whenever possible, I prefer to use verbose regular expressions with embedded comments, for maintainability.
Also note the use of named groups, `(?P<name>pattern)`. Naming each group documents the intent of the match and also provides a useful mechanism for extracting the data, if your needs go beyond simple regex validation.
import re
# Demo: validate simple US street addresses with a verbose regular expression
# whose named groups expose each component of the address.
#
# Goal: '717 N 2ND ST, MANKATO, MN 56001',
# Goal: '717 N 2ND ST, MANKATO, MN, 56001',
#
# The VERBOSE and IGNORECASE flags are passed to re.compile() rather than
# embedded as (?x)/(?i): the pattern text begins with a newline, and
# Python 3.11+ raises re.error for global inline flags that are not at the
# very start of the expression.
regex = re.compile(r'''
    (?P<HouseNumber>\d+)\s+         # Matches '717 '
    (?P<Direction>[news])\s+        # Matches 'N '
    (?P<StreetName>\w+)\s+          # Matches '2ND '
    (?P<StreetDesignator>\w+),\s+   # Matches 'ST, '
    (?P<TownName>.*),\s+            # Matches 'MANKATO, '
    (?P<State>[A-Z]{2}),?\s+        # Matches 'MN ' and 'MN, '
    (?P<ZIP>\d{5})                  # Matches '56001'
''', re.VERBOSE | re.IGNORECASE)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
for item in (
    '717 N 2ND ST, MANKATO, MN 56001',
    '717 N 2ND ST, MANKATO, MN, 56001',
    '717 N 2ND, Makata, 56001',  # Should reject this one
    '1234 N D AVE, East Boston, MA, 02134',
):
    # match() anchors at the start of the string only; text after the ZIP
    # is not inspected.
    match = regex.match(item)
    print(item)
    if match:
        print(" House is on {Direction} side of {TownName}".format(**match.groupdict()))
    else:
        print(" invalid entry")
To make certain fields optional, we replace `+` with `*`, since `+` means ONE-or-more and `*` means ZERO-or-more. Here is a version that matches the new requirements in the comments:
import re
# Demo: same address validator, but the street designator ('ST') and a
# trailing street direction ('NE') are now optional.
#
# Goal: '717 N 2ND ST, MANKATO, MN 56001',
# Goal: '717 N 2ND ST, MANKATO, MN, 56001',
# Goal: '717 N 2ND ST NE, MANKATO, MN, 56001',
# Goal: '717 N 2ND, MANKATO, MN, 56001',
#
# The VERBOSE and IGNORECASE flags are passed to re.compile() rather than
# embedded as (?x)/(?i): the pattern text begins with a newline, and
# Python 3.11+ raises re.error for global inline flags that are not at the
# very start of the expression.
regex = re.compile(r'''
    (?P<HouseNumber>\d+)\s+             # Matches '717 '
    (?P<Direction>[news])\s+            # Matches 'N '
    (?P<StreetName>\w+)\s*              # Matches '2ND ', with optional trailing space
    (?P<StreetDesignator>\w*)\s*        # Optionally Matches 'ST '
    (?P<StreetDirection>[news]*)\s*     # Optionally Matches 'NE'
    ,\s+                                # Force a comma after the street
    (?P<TownName>.*),\s+                # Matches 'MANKATO, '
    (?P<State>[A-Z]{2}),?\s+            # Matches 'MN ' and 'MN, '
    (?P<ZIP>\d{5})                      # Matches '56001'
''', re.VERBOSE | re.IGNORECASE)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
for item in (
    '717 N 2ND ST, MANKATO, MN 56001',
    '717 N 2ND ST, MANKATO, MN, 56001',
    '717 N 2ND, Makata, 56001',  # Should reject this one
    '1234 N D AVE, East Boston, MA, 02134',
    '717 N 2ND ST NE, MANKATO, MN, 56001',
    '717 N 2ND, MANKATO, MN, 56001',
):
    # Optional groups that match nothing are reported as '' (not None),
    # because the groups themselves always participate in the match.
    match = regex.match(item)
    print(item)
    if match:
        print(" House is on {Direction} side of {TownName}".format(**match.groupdict()))
    else:
        print(" invalid entry")
Next, consider the OR operator, `|`, and the non-capturing group operator, `(?:pattern)`. Together, they can describe complex alternatives in the input format. This version matches the new requirement that some addresses have the direction before the street name, and some have the direction after the street name, but no address has the direction in both places.
import re
# Demo: address validator that accepts the direction either before the street
# name ('717 N 2ND ST') or after it ('717 2ND ST NE'), but not in both places.
#
# Goal: '717 N 2ND ST, MANKATO, MN 56001',
# Goal: '717 N 2ND ST, MANKATO, MN, 56001',
# Goal: '717 2ND ST NE, MANKATO, MN, 56001',
# Goal: '717 N 2ND, MANKATO, MN, 56001',
#
# The two layouts use separate group names (Direction / Direction2, ...);
# the groups of whichever alternative did not match are None.
#
# The VERBOSE and IGNORECASE flags are passed to re.compile() rather than
# embedded as (?x)/(?i): the pattern text begins with a newline, and
# Python 3.11+ raises re.error for global inline flags that are not at the
# very start of the expression.
regex = re.compile(r'''
    (?:                                     # Matches any sort of street address
        (?:                                 # Matches '717 N 2ND ST' or '717 N 2ND'
            (?P<HouseNumber>\d+)\s+         # Matches '717 '
            (?P<Direction>[news])\s+        # Matches 'N '
            (?P<StreetName>\w+)\s*          # Matches '2ND ', with optional trailing space
            (?P<StreetDesignator>\w*)\s*    # Optionally Matches 'ST '
        )
        |                                   # OR
        (?:                                 # Matches '717 2ND ST NE' or '717 2ND NE'
            (?P<HouseNumber2>\d+)\s+        # Matches '717 '
            (?P<StreetName2>\w+)\s+         # Matches '2ND '
            (?:                             # Optionally Matches 'ST '. The designator must be
                (?P<StreetDesignator2>\w+)  # a whole word followed by whitespace; a bare
                \s+                         # greedy \w* would swallow 'NE' and then split it
            )?                              # into designator 'N' + direction 'E'.
            (?P<Direction2>[news]+)         # Matches 'NE'
        )
    )
    ,\s+                                    # Force a comma after the street
    (?P<TownName>.*),\s+                    # Matches 'MANKATO, '
    (?P<State>[A-Z]{2}),?\s+                # Matches 'MN ' and 'MN, '
    (?P<ZIP>\d{5})                          # Matches '56001'
''', re.VERBOSE | re.IGNORECASE)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
for item in (
    '717 N 2ND ST, MANKATO, MN 56001',
    '717 N 2ND ST, MANKATO, MN, 56001',
    '717 N 2ND, Makata, 56001',  # Should reject this one
    '1234 N D AVE, East Boston, MA, 02134',
    '717 2ND ST NE, MANKATO, MN, 56001',
    '717 N 2ND, MANKATO, MN, 56001',
):
    match = regex.match(item)
    print(item)
    if match:
        d = match.groupdict()
        # Exactly one of the two direction groups is set, depending on which
        # alternative matched; take whichever is not None.
        print(" House is on {0} side of {1}".format(
            d['Direction'] or d['Direction2'],
            d['TownName']))
    else:
        print(" invalid entry")