I am attempting to enrich a dataset with zip codes from the Chicago Data Portal. The chicago crimes dataset can be found at https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2 and the geographic data for zip codes can be found at https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-ZIP-Codes/gdcf-axmw
Concerning the shape file, I downloaded the geojson, saved it to github and loaded the file into a Mage data loader as a GeoPandas Dataframe. Please see the code for the following:
- Load the Crimes dataset for 2023 as a Pandas DataFrame:
import io
import pandas as pd
import requests
from pandas import DataFrame
if 'data_loader' not in globals():
from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
from mage_ai.data_preparation.decorators import test
@data_loader
def load_data_from_api(**kwargs):
url='https://data.cityofchicago.org/resource/ijzp-q8t2.csv?$limit=400000'
app_token = 'You May Need Your Own APP Token'
year="2023"
headers = {
"id": "id",
"case_number": "case_number",
"date": "date",
"block":"block",
"iucr": "iucr",
"primary_type": "primary_type",
"description": "description",
"location_descrpition": "location_descrpition",
"arrest": "arrest",
"domestic": "domestic",
"beat": "beat",
"district": "district",
"ward": "ward",
"community_area": "community_area",
"fbi_code": "fbi_code",
"x_coordinate": "x_coordinate",
"y_coordinate": "y_coordinate",
"year": "year",
"updated_on": "updated_on",
"latitude": "lat",
"longitude": "lng",
"location": "location"
}
# Add the year as a query parameter in the API request
params = {
'$$app_token': app_token,
'year': year
}
response = requests.get(url, params=params)
if response.status_code == 200:
# Read the CSV data from the response
df = pd.read_csv(io.StringIO(response.text), sep=',')
df.rename(columns=headers, inplace=True)
return df
else:
# Handle error cases
print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
return df
@test
def test_output(output, *args) -> None:
"""
Template code for testing the output of the block.
"""
assert output is not None, 'The output is undefined'
The output is a Pandas DataFrame that with a portion of the DataFrame in this picture
- Load the Chicago Zip Code geojson as a geopandas dataframe:
import json
import geopandas as gpd
import pandas as pd
import requests
from shapely.geometry import Point
if 'data_loader' not in globals():
from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
from mage_ai.data_preparation.decorators import test
@data_loader
def get_geojson_data(**kwargs):
"""
Template code for loading data from any source.
Returns:
Anything (e.g. data frame, dictionary, array, int, str, etc.)
"""
# Specify your data loading logic here
access_token = 'access token if you have a private github repository'
raw_file_url = "https://raw.githubusercontent.com/github_username/Chicago_Crime/main/chicago_boundaries_zipcodes.geojson"
headers = {
'Authorization': f'Bearer {access_token}',
'Accept': 'application/vnd.github.v3.raw'
}
response = requests.get(raw_file_url, headers=headers)
if response.status_code == 200:
geojson_data_str = response.text
geojson_data = json.loads(geojson_data_str)
# Convert GeoJSON to GeoDataFrame
gdf = gpd.GeoDataFrame.from_features(geojson_data['features'])
return gdf
else:
raise Exception(f"Failed to retrieve the file. Status code: {response.status_code}")
return gdf
@test
def test_output(output, *args) -> None:
"""
Template code for testing the output of the block.
"""
assert output is not None, 'The output is undefined'
The output of this geopandas dataframe is:
My goal is to return a zip code in the Chicago Crimes Pandas DataFrame. I attempted the following code but get a key error. I am not sure what is going on as I know I output the 'lng' column from the dataframe df,
- See code and traceback.
import json
import pandas as pd
import requests
from shapely.geometry import Point
import geopandas as gpd
if 'transformer' not in globals():
from mage_ai.data_preparation.decorators import transformer
if 'test' not in globals():
from mage_ai.data_preparation.decorators import test
@transformer
def transform(df, gdf, *args, **kwargs):
"""
Template code for a transformer block.
Args:
df: DataFrame from the parent block
gdf: GeoDataFrame containing geometry (polygons) and zip codes
args: Output from any additional upstream blocks (if applicable)
Returns:
DataFrame with added 'zip_code' column
"""
# Create Point geometries from latitude and longitude columns in df
geometry = [Point(xy) for xy in zip(df["lng"], df["lat"])]
# Create a GeoDataFrame from df with the geometry
df_geo = gpd.GeoDataFrame(df, geometry=geometry)
# Initialize an empty list to store the zip codes
zip_codes = []
# Iterate through each row in df_geo
for index, row in df_geo.iterrows():
point = row["geometry"]
# Iterate through each row in gdf to find if the point is within any polygon
for gdf_index, gdf_row in gdf.iterrows():
if point.within(gdf_row["geometry"]):
zip_codes.append(gdf_row["zip"])
break # Stop searching once a match is found
else:
zip_codes.append(None) # Append None if no match is found
# Add the 'zip_code' column to df_geo
df_geo["zip_code"] = zip_codes
# Return the GeoDataFrame with the 'zip_code' column
return df_geo
@test
def test_output(output, *args) -> None:
"""
Template code for testing the output of the block.
"""
assert output is not None, 'The output is undefined'
KeyError Traceback (most recent call last)
/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key)
3652 try:
-> 3653 return self._engine.get_loc(casted_key)
3654 except KeyError as err:
/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'lng'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/c/data_engineering_with_python/311_project/transformers/zip_codes_1.py in transform(df, gdf, *args, **kwargs)
27
28 # Create Point geometries from latitude and longitude columns in df
---> 29 geometry = [Point(xy) for xy in zip(df["lng"], df["lat"])]
30
31 # Create a GeoDataFrame from df with the geometry
/c/data_engineering_with_python/lib/python3.8/site-packages/geopandas/geodataframe.py in __getitem__(self, key)
1473 return a GeoDataFrame.
1474 """
-> 1475 result = super().__getitem__(key)
1476 # Custom logic to avoid waiting for pandas GH51895
1477 # result is not geometry dtype for multi-indexes
/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
3759 if self.columns.nlevels > 1:
3760 return self._getitem_multilevel(key)
-> 3761 indexer = self.columns.get_loc(key)
3762 if is_integer(indexer):
3763 indexer = [indexer]
/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key)
3653 return self._engine.get_loc(casted_key)
3654 except KeyError as err:
-> 3655 raise KeyError(key) from err
3656 except TypeError:
3657 # If we have a listlike key, _check_indexing_error will raise
KeyError: 'lng'
Any help would be greatly appreciated and thank you for your time.