0

I am attempting to enrich a dataset with zip codes from the Chicago Data Portal. The chicago crimes dataset can be found at https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2 and the geographic data for zip codes can be found at https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-ZIP-Codes/gdcf-axmw

Concerning the shape file, I downloaded the geojson, saved it to github and loaded the file into a Mage data loader as a GeoPandas Dataframe. Please see the code for the following:

  1. Load the Crimes dataset for 2023 as a Pandas DataFrame:
import io
import pandas as pd
import requests
from pandas import DataFrame
if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def load_data_from_api(**kwargs):
    url='https://data.cityofchicago.org/resource/ijzp-q8t2.csv?$limit=400000'
    app_token = 'You May Need Your Own APP Token'
    year="2023"
    headers = {
        "id": "id",
        "case_number": "case_number",
        "date": "date",
        "block":"block",
        "iucr": "iucr",
        "primary_type": "primary_type",
        "description": "description",
        "location_descrpition": "location_descrpition",
        "arrest": "arrest",
        "domestic": "domestic",
        "beat": "beat",
        "district": "district",
        "ward": "ward",
        "community_area": "community_area",
        "fbi_code": "fbi_code",
        "x_coordinate": "x_coordinate",
        "y_coordinate": "y_coordinate",
        "year": "year",
        "updated_on": "updated_on",
        "latitude": "lat",
        "longitude": "lng",
        "location": "location"
        }
    # Add the year as a query parameter in the API request
    params = {
        '$$app_token': app_token,
        'year': year
    }

    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        # Read the CSV data from the response
        df = pd.read_csv(io.StringIO(response.text), sep=',')

        df.rename(columns=headers, inplace=True)

        return df
    else:
        # Handle error cases
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")

    return df


@test
def test_output(output, *args) -> None:
    """
    Template code for testing the output of the block.
    """
    assert output is not None, 'The output is undefined'

The output is a Pandas DataFrame that with a portion of the DataFrame in this picture

enter image description here

  1. Load the Chicago Zip Code geojson as a geopandas dataframe:
import json
import geopandas as gpd
import pandas as pd
import requests
from shapely.geometry import Point

if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_loader
def get_geojson_data(**kwargs):
    """
    Template code for loading data from any source.

    Returns:
        Anything (e.g. data frame, dictionary, array, int, str, etc.)
    """
    # Specify your data loading logic here

    access_token = 'access token if you have a private github repository'
    raw_file_url = "https://raw.githubusercontent.com/github_username/Chicago_Crime/main/chicago_boundaries_zipcodes.geojson"
    
    headers = {
    'Authorization': f'Bearer {access_token}',
    'Accept': 'application/vnd.github.v3.raw'
    }

    response = requests.get(raw_file_url, headers=headers)

    if response.status_code == 200:
        geojson_data_str = response.text
        geojson_data = json.loads(geojson_data_str)
        
        # Convert GeoJSON to GeoDataFrame
        gdf = gpd.GeoDataFrame.from_features(geojson_data['features'])
        
        return gdf
    else:
        raise Exception(f"Failed to retrieve the file. Status code: {response.status_code}")
        
    return gdf


@test
def test_output(output, *args) -> None:
    """
    Template code for testing the output of the block.
    """
    assert output is not None, 'The output is undefined'

The output of this geopandas dataframe is:

enter image description here

My goal is to return a zip code in the Chicago Crimes Pandas DataFrame. I attempted the following code but get a key error. I am not sure what is going on as I know I output the 'lng' column from the dataframe df,

  1. See code and traceback.
import json
import pandas as pd
import requests
from shapely.geometry import Point
import geopandas as gpd


if 'transformer' not in globals():
    from mage_ai.data_preparation.decorators import transformer
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@transformer
def transform(df, gdf, *args, **kwargs):
    """
    Template code for a transformer block.

    Args:
        df: DataFrame from the parent block
        gdf: GeoDataFrame containing geometry (polygons) and zip codes
        args: Output from any additional upstream blocks (if applicable)

    Returns:
        DataFrame with added 'zip_code' column
    """

    # Create Point geometries from latitude and longitude columns in df
    geometry = [Point(xy) for xy in zip(df["lng"], df["lat"])]

    # Create a GeoDataFrame from df with the geometry
    df_geo = gpd.GeoDataFrame(df, geometry=geometry)

    # Initialize an empty list to store the zip codes
    zip_codes = []

    # Iterate through each row in df_geo
    for index, row in df_geo.iterrows():
        point = row["geometry"]
        
        # Iterate through each row in gdf to find if the point is within any polygon
        for gdf_index, gdf_row in gdf.iterrows():
            if point.within(gdf_row["geometry"]):
                zip_codes.append(gdf_row["zip"])
                break  # Stop searching once a match is found
        else:
            zip_codes.append(None)  # Append None if no match is found

    # Add the 'zip_code' column to df_geo
    df_geo["zip_code"] = zip_codes

    # Return the GeoDataFrame with the 'zip_code' column
    return df_geo

@test
def test_output(output, *args) -> None:
    """
    Template code for testing the output of the block.
    """
    assert output is not None, 'The output is undefined'
KeyError                                  Traceback (most recent call last)

/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key)

   3652         try:

-> 3653             return self._engine.get_loc(casted_key)

   3654         except KeyError as err:


/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()


/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


KeyError: 'lng'


The above exception was the direct cause of the following exception:


KeyError                                  Traceback (most recent call last)

/c/data_engineering_with_python/311_project/transformers/zip_codes_1.py in transform(df, gdf, *args, **kwargs)

     27 

     28     # Create Point geometries from latitude and longitude columns in df

---> 29     geometry = [Point(xy) for xy in zip(df["lng"], df["lat"])]

     30 

     31     # Create a GeoDataFrame from df with the geometry


/c/data_engineering_with_python/lib/python3.8/site-packages/geopandas/geodataframe.py in __getitem__(self, key)

   1473         return a GeoDataFrame.

   1474         """

-> 1475         result = super().__getitem__(key)

   1476         # Custom logic to avoid waiting for pandas GH51895

   1477         # result is not geometry dtype for multi-indexes


/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)

   3759             if self.columns.nlevels > 1:

   3760                 return self._getitem_multilevel(key)

-> 3761             indexer = self.columns.get_loc(key)

   3762             if is_integer(indexer):

   3763                 indexer = [indexer]


/c/data_engineering_with_python/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key)

   3653             return self._engine.get_loc(casted_key)

   3654         except KeyError as err:

-> 3655             raise KeyError(key) from err

   3656         except TypeError:

   3657             # If we have a listlike key, _check_indexing_error will raise


KeyError: 'lng'

Any help would be greatly appreciated and thank you for your time.

Cole
  • 95
  • 1
  • 2
  • 5

0 Answers0