import pandas as pd
import geopandas as gpd
import numpy as np
import json
import folium
from folium.plugins import MarkerCluster
# Load the film permits data
film_permits = pd.read_csv('Dataset/nyc_film_permits.csv')
# Load the borough boundaries data
with open('Dataset/nyc_borough_boundaries.geojson') as f:
borough_boundaries_data = json.load(f)
borough_boundaries = gpd.GeoDataFrame.from_features(borough_boundaries_data)
# Load the postal codes data
postal_codes = pd.read_csv('Dataset/us_postalcode.csv', usecols=['zip', 'lat', 'lng'])
# Rename the columns to match the column names used before
postal_codes = postal_codes.rename(columns={'zip': 'PostalCode', 'lat': 'Latitude', 'lng': 'Longitude'})
postal_codes.head()
# print(film_permits['Borough'].unique())
# print(borough_boundaries['boro_name'].unique())
print(film_permits['ZipCode(s)'].unique())
print(postal_codes['PostalCode'].unique())
['11222' '11237, 11385' '11206' ... '11356, 11357, 11420' '10002, 10012, 11231' '10002, 10017, 10036, 10173'] [ 601 602 603 ... 99926 99927 99929]
# Keep only necessary columns
film_permits = film_permits[['Borough', 'ZipCode(s)']]
# Convert 'ZipCode(s)' to string type
film_permits['ZipCode(s)'] = film_permits['ZipCode(s)'].astype(str)
# Handle rows with multiple zip codes (split on comma, strip whitespace, and expand into separate rows)
film_permits = film_permits.assign(**{'ZipCode(s)': film_permits['ZipCode(s)'].str.split(',').apply(lambda x: [i.strip() for i in x])}).explode('ZipCode(s)')
film_permits = film_permits[film_permits['ZipCode(s)'].str.len() == 5]
# # Convert 'nan' strings to actual NaN values
# film_permits['ZipCode(s)'] = film_permits['ZipCode(s)'].replace('nan', np.nan)
# # Drop rows with 'nan' or empty values in 'ZipCode(s)'
# film_permits = film_permits[film_permits['ZipCode(s)'].notna()]
film_permits['ZipCode(s)'] = film_permits['ZipCode(s)'].astype(int)
print(film_permits['ZipCode(s)'].unique())
[11222 11237 11385 11206 10007 10013 10282 10014 11105 10312 10012 10011 10002 10016 10018 10038 10029 10019 10020 11378 11223 10463 10473 10028 10128 11101 11109 11211 10003 10024 10023 11201 11207 10022 10152 11217 11218 10112 10001 10121 10031 10032 10025 10026 11226 11379 10036 10119 10457 10458 10471 11205 10027 10035 11215 10106 10010 10017 11231 10021 10065 11106 10030 10004 10006 11203 11104 11377 11224 11375 10462 11365 11366 10075 11249 11368 11373 10454 11220 11232 11357 11359 11430 11434 11436 10466 11418 10309 11356 10044 10105 10173 11216 10005 11367 11374 11102 11230 11372 11432 11213 10278 11234 11209 11361 83 10470 11225 10033 11221 11208 11239 11362 11363 10045 11251 10171 11236 11238 10305 10308 10009 11235 10118 10165 10103 11435 11103 11354 11415 11419 11420 10041 11229 10474 10452 10037 10467 10170 10451 11214 11228 10465 10301 11233 10306 10281 11691 10461 10034 11416 11212 11210 10456 10304 10115 10040 10455 10459 10280 10111 11355 10472 10310 11693 11694 10162 10279 10154 10158 11421 11358 11364 11427 10172 10271 10464 10069 11692 10303 10155 10167 10177 10048 10039 10153 11417 10097 10460 11433 10302 11360 11204 11414 10110 10175 10176 11004 11040 10314 10120 11038 11370 11219 10453 10468 10307 11695 11369 11371 10000 10469 11697 10475 11423]
# Keep only rows with postal codes in film permits data
postal_codes = postal_codes[postal_codes['PostalCode'].isin(film_permits['ZipCode(s)'].unique())]
# Keep only necessary columns
postal_codes = postal_codes[['PostalCode', 'Latitude', 'Longitude']]
postal_codes.head(3)
PostalCode | Latitude | Longitude | |
---|---|---|---|
2583 | 10001 | 40.75064 | -73.99728 |
2584 | 10002 | 40.71597 | -73.98692 |
2585 | 10003 | 40.73184 | -73.98915 |
# Merge data
film_permits = film_permits.merge(postal_codes, left_on='ZipCode(s)', right_on='PostalCode')
film_permits = film_permits.merge(borough_boundaries, left_on='Borough', right_on='boro_name')
film_permits.head(2)
Borough | ZipCode(s) | PostalCode | Latitude | Longitude | geometry | boro_code | boro_name | shape_area | shape_leng | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Brooklyn | 11222 | 11222 | 40.72862 | -73.94764 | MULTIPOLYGON (((-73.86327 40.58388, -73.86381 ... | 3 | Brooklyn | 1934138258.43 | 728148.53241 |
1 | Brooklyn | 11222 | 11222 | 40.72862 | -73.94764 | MULTIPOLYGON (((-73.86327 40.58388, -73.86381 ... | 3 | Brooklyn | 1934138258.43 | 728148.53241 |
import matplotlib.pyplot as plt
# Extract the longitude and latitude values
longitude_values = film_permits['Longitude']
latitude_values = film_permits['Latitude']
# Define the size and the bins for the histogram
heatmap, xedges, yedges = np.histogram2d(longitude_values, latitude_values, bins=[100,100])
# Plot the heatmap
plt.imshow(heatmap.T, origin='lower')
plt.show()
# Initialize the map
m = folium.Map(location=[40.7128, -74.0060], zoom_start=11) # coordinates for NYC
# Add film permits locations to the map
for idx, row in film_permits.iterrows():
folium.CircleMarker([row['Latitude'], row['Longitude']], radius=1).add_to(m)
# Display the map
m