[This work is based on this course: IBM Data Science Professional Certificate.]
Our objective is to summarize the types of restaurants that a tourist or a resident can find in each neighborhood of Berlin: where to eat American food, German food, and so on.
1 – Import Libraries
from IPython.display import Image from IPython.core.display import HTML import pickle import requests import folium import pandas as pd import numpy as np # library to handle data in a vectorized manner import shapely.geometry from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values import pyproj import math import matplotlib.pyplot as plt import json from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe # Matplotlib and associated plotting modules import matplotlib.cm as cm import matplotlib.colors as colors # import k-means from clustering stage from sklearn.cluster import KMeans from bs4 import BeautifulSoup
2 – Extract Berlin Boroughs info
# Scrape the table of Berlin boroughs from Wikipedia into a DataFrame with
# one row per borough and the columns Borough / Population / Area / Density.
url = 'https://en.wikipedia.org/wiki/Boroughs_and_neighborhoods_of_Berlin'
response = requests.get(url)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
source = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(source, 'html.parser')
table_data = soup.find('div', class_='mw-parser-output')
table = table_data.table
# Fall back to the <table> itself when the markup carries no explicit <tbody>.
table = table.tbody or table

columns = ['Borough', 'Population', 'Area', 'Density']
data = {key: [] for key in columns}
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Header rows have no <td>; rows with too few cells would leave the
    # column lists with unequal lengths and break the DataFrame constructor.
    if len(cells) < len(columns):
        continue
    for cell, column in zip(cells, columns):
        data[column].append(cell.text.replace('\n', ''))

df = pd.DataFrame.from_dict(data=data)[columns]
df
[table id=102 /]
We need to change the name of the Lichtenberg borough because there is another place called Lichtenberg in Germany, and the two could be confused:
# Give the Lichtenberg borough a name that also mentions Berlin.
is_lichtenberg = df['Borough'] == 'Lichtenberg'
df.loc[is_lichtenberg, 'Borough'] = 'Lichtenberg Berlin'
df
[table id=103 /]
We add the coordinates for each Borough
# Look up the coordinates of every borough and store them in the
# 'Latitude' and 'Longitude' columns of df.
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="Berlin_food")
# Throttle the lookups: one request per second, as geopy recommends when
# geocoding a whole pandas column against Nominatim.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

locations = df['Borough'].apply(geocode)

# geocode yields None when nothing is found; report the affected boroughs by
# name instead of failing later with an AttributeError on None.
unresolved = df.loc[locations.isna(), 'Borough'].tolist()
if unresolved:
    raise ValueError('Could not geocode: {}'.format(', '.join(unresolved)))

df['Latitude'] = locations.apply(lambda loc: loc.latitude)
df['Longitude'] = locations.apply(lambda loc: loc.longitude)
df
[table id=104 /]
3 – Building clusters of the neighborhoods in Berlin
3.1 – Geographical coordinates of Berlin
# Geocode the city itself; latitude/longitude are reused to centre the maps.
address = 'Berlin, Germany'
geolocator = Nominatim(user_agent="Berlin_food")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('Coordinate of {}: {}, {}'.format(address, latitude, longitude))
Coordinate of Berlin, Germany: 52.5170365, 13.3888599
3.2 – Visualization of Berlin’s neighborhoods
# Base map centred on the city coordinates.
map_berlin = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every borough marker.
marker_style = dict(
    radius=5,
    color='red',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per borough, with the borough name as popup text.
for lat, lng, borough in zip(df['Latitude'], df['Longitude'], df['Borough']):
    popup = folium.Popup(borough, parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_berlin)

map_berlin

3.3 – Using Foursquare API, I will explore the neighborhoods of Berlin
CLIENT_ID = '**************' # Here goes your Foursquare ID CLIENT_SECRET = '***************' ACCESS_TOKEN = '************' # Here goes your FourSquare Access Token VERSION = '20210505' LIMIT = 50
3.4 – Show venues in Mitte within a radius of 3500 meters.
# Row label of the borough to explore (Mitte in the table built above).
row_label = 4

neighborhood_name = df.loc[row_label, 'Borough']
neighborhood_latitude = df.loc[row_label, 'Latitude']
neighborhood_longitude = df.loc[row_label, 'Longitude']

print('Latitude and longitude values of {} are {}, {}.'.format(
    neighborhood_name,
    neighborhood_latitude,
    neighborhood_longitude))
Latitude and longitude values of Mitte are 52.5178855, 13.4040601.
# Ask Foursquare for up to LIMIT venues within `radius` metres of the borough.
LIMIT = 30
radius = 3500

explore_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

response = requests.get(url)
results = response.json()
def get_category_type(row):
    """Return the name of the first category of a venue row.

    Args:
        row: Mapping-like record (dict or pandas Series) holding the venue's
            category list under ``'categories'`` or, failing that, under
            ``'venue.categories'``.

    Returns:
        The ``'name'`` of the first category, or ``None`` when the venue has
        no categories (empty list, ``None`` or any non-list value).

    Raises:
        KeyError: If the row has neither of the two keys.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key means "try the flattened column name"; any other
        # error should surface instead of being swallowed.
        categories_list = row['venue.categories']

    # Treat a missing or malformed category list the same as an empty one.
    if not isinstance(categories_list, (list, tuple)) or not categories_list:
        return None
    return categories_list[0]['name']
# Flatten the venues returned by the explore request into a four-column table:
# name, categories (first category name only), lat, lng.
venues = results['response']['groups'][0]['items']

# pandas.json_normalize is the supported entry point; the
# pandas.io.json.json_normalize import path is deprecated.
nearby_venues = pd.json_normalize(venues)

# Keep only the columns of interest. The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# Replace each category list by the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Strip the dotted prefixes: 'venue.location.lat' -> 'lat', etc.
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues
[table id=105 /]
# Number of rows in the venue table.
venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))
30 venues were returned by Foursquare.
# One entry per distinct category, so the length is the number of categories.
category_counts = nearby_venues['categories'].value_counts()
print ('{} unique categories in {}'.format(len(category_counts), neighborhood_name))
24 unique categories in Mitte
# The 20 most frequent categories, most common first.
top_categories = nearby_venues['categories'].value_counts().head(20)
print (top_categories)
Bookstore 5 Concert Hall 2 Park 2 Monument / Landmark 1 Chocolate Shop 1 Hotel 1 Bistro 1 Café 1 Cosmetics Shop 1 Sandwich Place 1 Poke Place 1 Wine Bar 1 Ice Cream Shop 1 Bike Rental / Bike Share 1 Caucasian Restaurant 1 Coffee Shop 1 Garden 1 Exhibit 1 Gourmet Shop 1 Indie Movie Theater 1 Name: categories, dtype: int64
3.5 – Exploration of the neighbourhoods in Berlin
def getNearbyVenues(names, latitudes, longitudes, radius=3500, LIMIT=1000):
    """Collect the venues Foursquare reports around each location.

    Sends one explore request per (name, latitude, longitude) triple, using
    the module-level CLIENT_ID, CLIENT_SECRET and VERSION, and prints each
    name as it is processed.

    Args:
        names: Iterable of location names (used to label the result rows).
        latitudes: Iterable of latitudes, aligned with ``names``.
        longitudes: Iterable of longitudes, aligned with ``names``.
        radius: Value sent as the ``radius`` query parameter.
        LIMIT: Value sent as the ``limit`` query parameter.
            NOTE(review): the API is believed to cap this well below the
            default of 1000 - confirm against the Foursquare documentation.

    Returns:
        DataFrame with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. It is empty, but still has
        these columns, when no venue is returned.

    Raises:
        requests.HTTPError: If Foursquare answers with an error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status turns an API error into a clear exception instead
        # of a KeyError while digging through the JSON.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # A venue without any category cannot be classified downstream;
            # skip it rather than fail on categories[0].
            if not categories:
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names to the constructor keeps the frame well-formed
    # even when no rows were collected.
    return pd.DataFrame(rows, columns=column_names)
# Query Foursquare once per borough, using the default radius and limit.
berlin_venues = getNearbyVenues(df['Borough'], df['Latitude'], df['Longitude'])
Charlottenburg-Wilmersdorf Friedrichshain-Kreuzberg Lichtenberg Berlin Marzahn-Hellersdorf Mitte Neukölln Pankow Reinickendorf Spandau Steglitz-Zehlendorf Tempelhof-Schöneberg Treptow-Köpenick
# Dimensions of the combined venue table as (rows, columns).
berlin_venues.shape
(1081, 7)
Creating a dataframe out of it to concentrate only on Restaurants
# Keep only the venues whose category name mentions 'Restaurant'.
is_restaurant = berlin_venues['Venue Category'].str.contains('Restaurant')
berlin_Venues_only_restaurant = berlin_venues[is_restaurant].reset_index(drop=True)

# Number the remaining rows from 1 instead of 0.
restaurant_total = len(berlin_Venues_only_restaurant)
berlin_Venues_only_restaurant.index = np.arange(1, restaurant_total + 1)

# Frequency of each restaurant category across all boroughs.
restaurant_category_counts = berlin_Venues_only_restaurant['Venue Category'].value_counts()
print (restaurant_category_counts)
Italian Restaurant 40 German Restaurant 25 Greek Restaurant 14 Fast Food Restaurant 11 Restaurant 11 Falafel Restaurant 10 Vietnamese Restaurant 8 Asian Restaurant 8 Thai Restaurant 7 Doner Restaurant 6 Chinese Restaurant 6 Middle Eastern Restaurant 5 Argentinian Restaurant 5 Mexican Restaurant 5 Indian Restaurant 5 Sushi Restaurant 4 Vegetarian / Vegan Restaurant 4 Turkish Restaurant 4 Seafood Restaurant 4 Mediterranean Restaurant 3 Spanish Restaurant 3 French Restaurant 3 Dumpling Restaurant 3 African Restaurant 3 Eastern European Restaurant 3 Korean Restaurant 3 Japanese Restaurant 2 Ramen Restaurant 2 Lebanese Restaurant 2 Caucasian Restaurant 1 Syrian Restaurant 1 Halal Restaurant 1 New American Restaurant 1 Moroccan Restaurant 1 Name: Venue Category, dtype: int64
# Count the distinct restaurant categories.
unique_categories = berlin_Venues_only_restaurant['Venue Category'].unique()
print('There are {} unique categories.'.format(len(unique_categories)))
There are 34 unique categories.
Creating a dataframe of top 10 categories
# Table of the ten most frequent restaurant categories, with the columns
# 'Venue_Category' and 'Frequency'.
category_counts = berlin_Venues_only_restaurant['Venue Category'].value_counts()

# Name the index and the value column explicitly. Relying on reset_index()
# producing a column called "index" only works on older pandas: newer
# releases name that column after the counted column, so the rename would
# silently do nothing.
berlin_Venues_Top10 = (
    category_counts.head(10)
    .rename_axis('Venue_Category')
    .reset_index(name='Frequency')
    .rename(index=str)  # keep the string row labels of the original table
)
berlin_Venues_Top10
[table id=106 /]
# Bar chart of the ten most frequent restaurant categories.
import seaborn as sns
from matplotlib import pyplot as plt

# Create the figure BEFORE plotting so that figsize applies to the chart;
# creating it afterwards only opens a second, empty figure.
fig, ax = plt.subplots(figsize=(18, 15))
s = sns.barplot(x="Venue_Category", y="Frequency", data=berlin_Venues_Top10, ax=ax)

# Slant the category names so they do not overlap.
plt.setp(s.get_xticklabels(), rotation=45, horizontalalignment='right')

s.set_title('10 Most Frequently Venues in 12 Boroughs of Berlin', fontsize=15)
s.set_xlabel("Venue Category", fontsize=15)
s.set_ylabel("Frequency", fontsize=15)
plt.show()

3.6 – Size of the dataframe
# Report the size of the restaurant-only table and preview its first rows.
restaurant_shape = berlin_Venues_only_restaurant.shape
print ("Shape with only Restaurant: ", restaurant_shape)
berlin_Venues_only_restaurant.head(10)
Shape with only Restaurant: (214, 7)
[table id=107 /]
3.7 – Analysis of the neighbourhoods
def _count_restaurants(categories):
    """Count the entries of a category Series that mention 'Restaurant'."""
    mentions_restaurant = categories.str.contains('Restaurant')
    return categories[mentions_restaurant].count()


# Number of restaurants per borough.
categories_by_neighborhood = berlin_Venues_only_restaurant.groupby(['Neighborhood'])['Venue Category']
berlin_Venues_restaurant = categories_by_neighborhood.apply(_count_restaurants)
berlin_Venues_restaurant
Neighborhood Charlottenburg-Wilmersdorf 26 Friedrichshain-Kreuzberg 21 Lichtenberg Berlin 10 Marzahn-Hellersdorf 11 Mitte 6 Neukölln 21 Pankow 12 Reinickendorf 33 Spandau 24 Steglitz-Zehlendorf 23 Tempelhof-Schöneberg 21 Treptow-Köpenick 6 Name: Venue Category, dtype: int64
# Turn the per-borough counts into a two-column table numbered from 1.
berlin_Venues_restaurant_df = berlin_Venues_restaurant.to_frame().reset_index()
berlin_Venues_restaurant_df.columns = ['Neighborhood', 'Number of Restaurant']
row_total = len(berlin_Venues_restaurant_df)
berlin_Venues_restaurant_df.index = np.arange(1, row_total + 1)

# Parallel lists: borough names and their restaurant counts, in table order.
list_dist = berlin_Venues_restaurant_df['Neighborhood'].tolist()
list_rest_no = berlin_Venues_restaurant_df['Number of Restaurant'].tolist()
One-Hot Encoding
# One indicator column per restaurant category.
category_frame = berlin_Venues_only_restaurant[['Venue Category']]
berlin_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Attach the borough of each restaurant (appended as the last column) ...
berlin_onehot['Neighborhood'] = berlin_Venues_only_restaurant['Neighborhood']

# ... then move that last column to the front.
*leading_columns, last_column = berlin_onehot.columns
fixed_columns = [last_column] + leading_columns
berlin_onehot = berlin_onehot[fixed_columns]

berlin_onehot.head()
[table id=108 /]
3.8 – Grouping by neighbourhoods and showing the mean of the frequency of occurrence for each category of restaurants.
# Mean of every indicator column per borough, i.e. the share of each
# restaurant category; 'Neighborhood' stays an ordinary column.
berlin_grouped = berlin_onehot.groupby('Neighborhood', as_index=False).mean()
berlin_grouped
[table id=109 /]
3.9 – Print the neighbourhoods with their respective top 10 most common venues.
# Print, for every borough, its most frequent restaurant categories.
num_top_venues = 10

for nb in berlin_grouped['Neighborhood']:
    print("**** " + nb + " ****")

    # Transpose the borough's single row into (venue, frequency) pairs.
    borough_row = berlin_grouped[berlin_grouped['Neighborhood'] == nb]
    temp = borough_row.T.reset_index()
    temp.columns = ['venue', 'frequency']

    # The first pair is the 'Neighborhood' label itself, not a category.
    temp = temp.iloc[1:].assign(frequency=lambda t: t['frequency'].astype(float))
    temp = temp.round({'frequency': 2})

    ranked = temp.sort_values('frequency', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')
**** Charlottenburg-Wilmersdorf **** venue frequency 0 Italian Restaurant 0.31 1 German Restaurant 0.19 2 Vietnamese Restaurant 0.12 3 Asian Restaurant 0.08 4 Greek Restaurant 0.04 5 Indian Restaurant 0.04 6 Mediterranean Restaurant 0.04 7 Falafel Restaurant 0.04 8 Argentinian Restaurant 0.04 9 French Restaurant 0.04 **** Friedrichshain-Kreuzberg **** venue frequency 0 Falafel Restaurant 0.29 1 Middle Eastern Restaurant 0.10 2 Thai Restaurant 0.10 3 African Restaurant 0.05 4 Mediterranean Restaurant 0.05 5 Lebanese Restaurant 0.05 6 Italian Restaurant 0.05 7 Spanish Restaurant 0.05 8 German Restaurant 0.05 9 French Restaurant 0.05 **** Lichtenberg Berlin **** venue frequency 0 Vietnamese Restaurant 0.2 1 Italian Restaurant 0.2 2 Greek Restaurant 0.2 3 German Restaurant 0.2 4 Syrian Restaurant 0.1 5 Indian Restaurant 0.1 6 Ramen Restaurant 0.0 7 Middle Eastern Restaurant 0.0 8 Moroccan Restaurant 0.0 9 New American Restaurant 0.0 **** Marzahn-Hellersdorf **** venue frequency 0 Italian Restaurant 0.36 1 Fast Food Restaurant 0.18 2 Greek Restaurant 0.18 3 Restaurant 0.09 4 Mexican Restaurant 0.09 5 Asian Restaurant 0.09 6 Thai Restaurant 0.00 7 Syrian Restaurant 0.00 8 Sushi Restaurant 0.00 9 Spanish Restaurant 0.00 **** Mitte **** venue frequency 0 Seafood Restaurant 0.17 1 Vegetarian / Vegan Restaurant 0.17 2 Caucasian Restaurant 0.17 3 Italian Restaurant 0.17 4 Middle Eastern Restaurant 0.17 5 Ramen Restaurant 0.17 6 Restaurant 0.00 7 Mexican Restaurant 0.00 8 Moroccan Restaurant 0.00 9 New American Restaurant 0.00 **** Neukölln **** venue frequency 0 African Restaurant 0.10 1 Falafel Restaurant 0.10 2 Vegetarian / Vegan Restaurant 0.10 3 Turkish Restaurant 0.10 4 Spanish Restaurant 0.10 5 Restaurant 0.10 6 Korean Restaurant 0.10 7 Dumpling Restaurant 0.10 8 Vietnamese Restaurant 0.05 9 Sushi Restaurant 0.05 **** Pankow **** venue frequency 0 Greek Restaurant 0.25 1 Italian Restaurant 0.17 2 Restaurant 0.08 3 Asian Restaurant 0.08 4 Chinese Restaurant 0.08 5 Doner 
Restaurant 0.08 6 Thai Restaurant 0.08 7 Mexican Restaurant 0.08 8 German Restaurant 0.08 9 African Restaurant 0.00 **** Reinickendorf **** venue frequency 0 Italian Restaurant 0.18 1 German Restaurant 0.15 2 Restaurant 0.09 3 Indian Restaurant 0.06 4 Seafood Restaurant 0.06 5 Greek Restaurant 0.06 6 Eastern European Restaurant 0.06 7 Argentinian Restaurant 0.06 8 Sushi Restaurant 0.03 9 New American Restaurant 0.03 **** Spandau **** venue frequency 0 Italian Restaurant 0.21 1 German Restaurant 0.17 2 Fast Food Restaurant 0.12 3 Argentinian Restaurant 0.08 4 Turkish Restaurant 0.08 5 Restaurant 0.08 6 Vietnamese Restaurant 0.04 7 Halal Restaurant 0.04 8 Greek Restaurant 0.04 9 Mexican Restaurant 0.04 **** Steglitz-Zehlendorf **** venue frequency 0 Italian Restaurant 0.30 1 German Restaurant 0.22 2 Asian Restaurant 0.09 3 Doner Restaurant 0.09 4 French Restaurant 0.04 5 Fast Food Restaurant 0.04 6 Restaurant 0.04 7 Greek Restaurant 0.04 8 Mexican Restaurant 0.04 9 Sushi Restaurant 0.04 **** Tempelhof-Schöneberg **** venue frequency 0 Italian Restaurant 0.19 1 Asian Restaurant 0.10 2 Thai Restaurant 0.10 3 Chinese Restaurant 0.10 4 Doner Restaurant 0.10 5 Fast Food Restaurant 0.10 6 Korean Restaurant 0.05 7 Greek Restaurant 0.05 8 Restaurant 0.05 9 Middle Eastern Restaurant 0.05 **** Treptow-Köpenick **** venue frequency 0 German Restaurant 0.33 1 Fast Food Restaurant 0.17 2 Sushi Restaurant 0.17 3 Greek Restaurant 0.17 4 Seafood Restaurant 0.17 5 Restaurant 0.00 6 Middle Eastern Restaurant 0.00 7 Moroccan Restaurant 0.00 8 New American Restaurant 0.00 9 Ramen Restaurant 0.00
3.10 – Creating a pandas dataframe.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    Args:
        row: pandas Series whose first entry is skipped and whose remaining
            entries are values indexed by label.
        num_top_venues: Maximum number of labels to return.

    Returns:
        Array of index labels ordered by descending value, cut to at most
        ``num_top_venues`` entries.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]
# Build a table with one row per borough and one column per rank
# ('1st Most Common Venue', '2nd ...', ...).
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Ranks 1-3 take their own suffix, every later rank takes 'th'. An
    # explicit bounds check replaces the former bare `except`, which would
    # also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = berlin_grouped['Neighborhood']

# Fill each borough's row with its top categories, most frequent first.
for ind in range(berlin_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(
        berlin_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(23)
[table id=110 /]
3.11 – Clustering the neighbourhoods with k-means.
# set number of clusters
kclusters = 5

# Cluster on the category shares only. `columns=` is used because the
# positional axis argument of DataFrame.drop is no longer accepted by
# current pandas.
berlin_grouped_clustering = berlin_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so that the result does not
# depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(berlin_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
array([1, 2, 1, 4, 3, 2, 4, 1, 1, 1], dtype=int32)
# Start the merged table from a copy of the borough table, so that later
# in-place operations on berlin_merged cannot alter df.
berlin_merged = df.copy()
berlin_merged.head(10)
[table id=111 /]
# Preview the first rows of the table of most common venue categories.
neighborhoods_venues_sorted.head()
[table id=112 /]
# NOTE: this binds a second name to the SAME DataFrame object; it is not a
# copy. Columns inserted through `neighborhoods_venues_sorted_w_clusters`
# are therefore also visible through `neighborhoods_venues_sorted`.
neighborhoods_venues_sorted_w_clusters = neighborhoods_venues_sorted
neighborhoods_venues_sorted_w_clusters.head()
[table id=113 /]
Add clustering labels
# Put the k-means label of each row into a new first column.
cluster_labels = kmeans.labels_
neighborhoods_venues_sorted_w_clusters.insert(loc=0, column='Nº Cluster', value=cluster_labels)
neighborhoods_venues_sorted_w_clusters.head()
[table id=114 /]
# NOTE(review): no visible cell creates a 'City district' column, so this
# rename presumably has no effect - confirm before removing it.
berlin_merged.rename(columns={'City district':'Neighborhood'}, inplace=True)

# Attach the cluster label and ranked categories to each borough, matching
# the 'Borough' column against the 'Neighborhood' values.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
berlin_merged = berlin_merged.join(venues_by_neighborhood, on='Borough')

berlin_merged.head()
[table id=115 /]
Finally, let’s visualize the resulting clusters.
# Map of the boroughs: marker colour encodes the cluster, marker size the
# number of restaurants found in the borough.
map_restaurants_10 = folium.Map(location=[latitude,longitude], tiles='cartodbpositron',
                                attr="<a href=https://github.com/python-visualization/folium/>Folium</a>")

# One colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Borough -> restaurant count, built once instead of searching the list
# for every marker.
restaurants_per_borough = dict(zip(list_dist, list_rest_no))

# add markers to the map
for lat, lon, poi, cluster in zip(berlin_merged['Latitude'], berlin_merged['Longitude'],
                                  berlin_merged['Borough'], berlin_merged['Nº Cluster']):
    # A borough without any restaurant has no cluster after the join; there
    # is nothing to draw for it.
    if pd.isna(cluster):
        continue
    # The join can turn the labels into floats; list indexing needs an int.
    cluster = int(cluster)

    label = folium.Popup(str(poi) + ' Nº Cluster ' + str(cluster), parse_html=True)
    # `cluster - 1` keeps the original colour assignment: cluster 0 wraps
    # around to the last colour, so every cluster still gets its own colour.
    cluster_colour = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=restaurants_per_borough[poi] * 0.5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_restaurants_10)

map_restaurants_10

3.12 – Examination of the 5 clusters.
Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster.
Cluster 0
# Boroughs of cluster 0 with their ranked restaurant categories. Columns are
# chosen by name: the former positional selection ([1] + range(5, ...))
# picked 'Population' and 'Longitude' rather than the borough name.
cluster_columns = ['Borough'] + list(berlin_merged.loc[:, 'Nº Cluster':].columns)
berlin_merged.loc[berlin_merged['Nº Cluster'] == 0, cluster_columns]
[table id=116 /]
- Cluster 0 could be called the German cluster.
Cluster 1
# Boroughs of cluster 1 with their ranked restaurant categories. Columns are
# chosen by name: the former positional selection ([1] + range(5, ...))
# picked 'Population' and 'Longitude' rather than the borough name.
cluster_columns = ['Borough'] + list(berlin_merged.loc[:, 'Nº Cluster':].columns)
berlin_merged.loc[berlin_merged['Nº Cluster'] == 1, cluster_columns]
[table id=117 /]
- Cluster 1 could be called the Italian cluster.
Cluster 2
# Boroughs of cluster 2 with their ranked restaurant categories. Columns are
# chosen by name: the former positional selection ([1] + range(5, ...))
# picked 'Population' and 'Longitude' rather than the borough name.
cluster_columns = ['Borough'] + list(berlin_merged.loc[:, 'Nº Cluster':].columns)
berlin_merged.loc[berlin_merged['Nº Cluster'] == 2, cluster_columns]
[table id=118 /]
- Cluster 2 could be called the Middle Eastern and African food cluster.
Cluster 3
# Boroughs of cluster 3 with their ranked restaurant categories. Columns are
# chosen by name: the former positional selection ([1] + range(5, ...))
# picked 'Population' and 'Longitude' rather than the borough name.
cluster_columns = ['Borough'] + list(berlin_merged.loc[:, 'Nº Cluster':].columns)
berlin_merged.loc[berlin_merged['Nº Cluster'] == 3, cluster_columns]
[table id=119 /]
- Cluster 3 could be called the Seafood cluster.
Cluster 4
# Boroughs of cluster 4 with their ranked restaurant categories. Columns are
# chosen by name: the former positional selection ([1] + range(5, ...))
# picked 'Population' and 'Longitude' rather than the borough name.
cluster_columns = ['Borough'] + list(berlin_merged.loc[:, 'Nº Cluster':].columns)
berlin_merged.loc[berlin_merged['Nº Cluster'] == 4, cluster_columns]
[table id=120 /]
- Cluster 4 could be called the Italian cluster.