Airbnb New York

Análisis de la distribución y caracterísitcas de las casas y pisos de los datos de Airbnb en New York.

1 – Importamos librerias y visualizamos el dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
df=pd.read_csv('AB_NYC_2019.csv')
df.head()

[table id=76 /]

df.dtypes
    id                                  int64
    name                               object
    host_id                             int64
    host_name                          object
    neighbourhood_group                object
    neighbourhood                      object
    latitude                          float64
    longitude                         float64
    room_type                          object
    price                               int64
    minimum_nights                      int64
    number_of_reviews                   int64
    last_review                        object
    reviews_per_month                 float64
    calculated_host_listings_count      int64
    availability_365                    int64
    dtype: object

2 – Data wrangling y Data Cleaning

– Vemos a ver cuantosdatos tenemos perdidos:

total = df.isnull().sum().sort_values(ascending=False) #total de valores perdidos por columna ordenados 

#Porcentaje de valores perdidos respecto al total de cada columna
percent = ((df.isnull().sum())*100)/df.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

[table id=77 /]

– Eliminamos las columnas que consideramos poco significantes:

df.drop(['id','host_name','last_review'], axis=1, inplace=True)
df.head(5)

[table id=78 /]

– Reemplazamos los NaN de reviews_per_month por 0:

df.fillna({'reviews_per_month':0}, inplace=True)

3 – Exploracion y visualización de los datos

df.neighbourhood_group.unique()
    array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
          dtype=object)
df.neighbourhood.unique()
    array(['Kensington', 'Midtown', 'Harlem', 'Clinton Hill', 'East Harlem',
           'Murray Hill', 'Bedford-Stuyvesant', "Hell's Kitchen",
           'Upper West Side', 'Chinatown', 'South Slope', 'West Village',
           'Williamsburg', 'Fort Greene', 'Chelsea', 'Crown Heights',
           'Park Slope', 'Windsor Terrace', 'Inwood', 'East Village',
           'Greenpoint', 'Bushwick', 'Flatbush', 'Lower East Side',
           'Prospect-Lefferts Gardens', 'Long Island City', 'Kips Bay',
           'SoHo', 'Upper East Side', 'Prospect Heights',
           'Washington Heights', 'Woodside', 'Brooklyn Heights',
           'Carroll Gardens', 'Gowanus', 'Flatlands', 'Cobble Hill',
           'Flushing', 'Boerum Hill', 'Sunnyside', 'DUMBO', 'St. George',
           'Highbridge', 'Financial District', 'Ridgewood',
           'Morningside Heights', 'Jamaica', 'Middle Village', 'NoHo',
           'Ditmars Steinway', 'Flatiron District', 'Roosevelt Island',
           'Greenwich Village', 'Little Italy', 'East Flatbush',
           'Tompkinsville', 'Astoria', 'Clason Point', 'Eastchester',
           'Kingsbridge', 'Two Bridges', 'Queens Village', 'Rockaway Beach',
           'Forest Hills', 'Nolita', 'Woodlawn', 'University Heights',
           'Gravesend', 'Gramercy', 'Allerton', 'East New York',
           'Theater District', 'Concourse Village', 'Sheepshead Bay',
           'Emerson Hill', 'Fort Hamilton', 'Bensonhurst', 'Tribeca',
           'Shore Acres', 'Sunset Park', 'Concourse', 'Elmhurst',
           'Brighton Beach', 'Jackson Heights', 'Cypress Hills', 'St. Albans',
           'Arrochar', 'Rego Park', 'Wakefield', 'Clifton', 'Bay Ridge',
           'Graniteville', 'Spuyten Duyvil', 'Stapleton', 'Briarwood',
           'Ozone Park', 'Columbia St', 'Vinegar Hill', 'Mott Haven',
           'Longwood', 'Canarsie', 'Battery Park City', 'Civic Center',
           'East Elmhurst', 'New Springville', 'Morris Heights', 'Arverne',
           'Cambria Heights', 'Tottenville', 'Mariners Harbor', 'Concord',
           'Borough Park', 'Bayside', 'Downtown Brooklyn', 'Port Morris',
           'Fieldston', 'Kew Gardens', 'Midwood', 'College Point',
           'Mount Eden', 'City Island', 'Glendale', 'Port Richmond',
           'Red Hook', 'Richmond Hill', 'Bellerose', 'Maspeth',
           'Williamsbridge', 'Soundview', 'Woodhaven', 'Woodrow',
           'Co-op City', 'Stuyvesant Town', 'Parkchester', 'North Riverdale',
           'Dyker Heights', 'Bronxdale', 'Sea Gate', 'Riverdale',
           'Kew Gardens Hills', 'Bay Terrace', 'Norwood', 'Claremont Village',
           'Whitestone', 'Fordham', 'Bayswater', 'Navy Yard', 'Brownsville',
           'Eltingville', 'Fresh Meadows', 'Mount Hope', 'Lighthouse Hill',
           'Springfield Gardens', 'Howard Beach', 'Belle Harbor',
           'Jamaica Estates', 'Van Nest', 'Morris Park', 'West Brighton',
           'Far Rockaway', 'South Ozone Park', 'Tremont', 'Corona',
           'Great Kills', 'Manhattan Beach', 'Marble Hill', 'Dongan Hills',
           'Castleton Corners', 'East Morrisania', 'Hunts Point', 'Neponsit',
           'Pelham Bay', 'Randall Manor', 'Throgs Neck', 'Todt Hill',
           'West Farms', 'Silver Lake', 'Morrisania', 'Laurelton',
           'Grymes Hill', 'Holliswood', 'Pelham Gardens', 'Belmont',
           'Rosedale', 'Edgemere', 'New Brighton', 'Midland Beach',
           'Baychester', 'Melrose', 'Bergen Beach', 'Richmondtown',
           'Howland Hook', 'Schuylerville', 'Coney Island', 'New Dorp Beach',
           "Prince's Bay", 'South Beach', 'Bath Beach', 'Jamaica Hills',
           'Oakwood', 'Castle Hill', 'Hollis', 'Douglaston', 'Huguenot',
           'Olinville', 'Edenwald', 'Grant City', 'Westerleigh',
           'Bay Terrace, Staten Island', 'Westchester Square', 'Little Neck',
           'Fort Wadsworth', 'Rosebank', 'Unionport', 'Mill Basin',
           'Arden Heights', "Bull's Head", 'New Dorp', 'Rossville',
           'Breezy Point', 'Willowbrook'], dtype=object)
df.room_type.unique()
    array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

– Top 10 de los host con más propiedades:

top_host=df.host_id.value_counts().head(10)
top_host
    219517861    327
    107434423    232
    30283594     121
    137358866    103
    12243051      96
    16098958      96
    61391963      91
    22541573      87
    200380610     65
    7503643       52
    Name: host_id, dtype: int64
#setting figure size for future visualizations
sns.set(rc={'figure.figsize':(10,8)})
viz_1=top_host.plot(kind='bar')
viz_1.set_title('Hosts with the most listings in NYC')
viz_1.set_ylabel('Count of listings')
viz_1.set_xlabel('Host IDs')
viz_1.set_xticklabels(viz_1.get_xticklabels(), rotation=45)

Relación ente el distrito y el precio:

– Vamos a echar un rápido de cómo están distribiodos los datos:

red_square = dict(markerfacecolor='salmon', markeredgecolor='salmon', marker='.')

df.boxplot(column='price', by='neighbourhood_group', 
           flierprops=red_square, vert=False, figsize=(10,8))

plt.xlabel('\nMedian Price', fontsize=12)
plt.ylabel('District\n', fontsize=12)
plt.title('\nBoxplot: Prices by Neighbourhood\n', fontsize=14, fontweight='bold')

# get rid of automatic boxplot title
plt.suptitle('')
  • Vemos que hay bastantes outliers. Vamos a proceder a limitar el margen del precio.

– Extraemos los precios por cada distrito:

#Brooklyn
sub_1=df.loc[df['neighbourhood_group'] == 'Brooklyn']
price_sub1=sub_1[['price']]
#Manhattan
sub_2=df.loc[df['neighbourhood_group'] == 'Manhattan']
price_sub2=sub_2[['price']]
#Queens
sub_3=df.loc[df['neighbourhood_group'] == 'Queens']
price_sub3=sub_3[['price']]
#Staten Island
sub_4=df.loc[df['neighbourhood_group'] == 'Staten Island']
price_sub4=sub_4[['price']]
#Bronx
sub_5=df.loc[df['neighbourhood_group'] == 'Bronx']
price_sub5=sub_5[['price']]

#Metemos todos los df's en una lista
price_list_by_n=[price_sub1, price_sub2, price_sub3, price_sub4, price_sub5]
# Creamos una lista vacia donde se irá añadiendo el precio para cada distrito
p_l_b_n_2=[]

#Creamos una lista con los valores conocidos de los distritos
nei_list=['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']

#loop para conocer los estadísticos por cada rango de precio que se guardará en nuestra lista vacia de arriba
for x in price_list_by_n:
    i=x.describe(percentiles=[.25, .50, .75])
    i=i.iloc[3:]
    i.reset_index(inplace=True)
    i.rename(columns={'index':'Stats'}, inplace=True)
    p_l_b_n_2.append(i)
    
#cambiar los nombres de la columna de precios al nombre del área para facilitar la lectura de la tabla   
p_l_b_n_2[0].rename(columns={'price':nei_list[0]}, inplace=True)
p_l_b_n_2[1].rename(columns={'price':nei_list[1]}, inplace=True)
p_l_b_n_2[2].rename(columns={'price':nei_list[2]}, inplace=True)
p_l_b_n_2[3].rename(columns={'price':nei_list[3]}, inplace=True)
p_l_b_n_2[4].rename(columns={'price':nei_list[4]}, inplace=True)

stat_df=p_l_b_n_2
stat_df=[df.set_index('Stats') for df in stat_df]
stat_df=stat_df[0].join(stat_df[1:])
stat_df

[table id=79 /]

  • Como hemos visto en los boxplot de arriba, hay varios outliers que nos pueden estar molestando, vamos a eliminarlos.

– Vamos a ver la densidad de la distribución de los precios usando violinplot:

sub_6=df[df.price < 500] #nos quedmoas con los que cuestan menos de 500$

viz_2=sns.violinplot(data=sub_6, x='neighbourhood_group', y='price')
viz_2.set_title('Density and distribution of prices for each neighberhood_group')

Relación de lo Barrios de los distritos frente al tipo de habitaciones

– Top 10 de barrios con propiedades:

df.neighbourhood.value_counts().head(10)
    Williamsburg          3920
    Bedford-Stuyvesant    3714
    Harlem                2658
    Bushwick              2465
    Upper West Side       1971
    Hell's Kitchen        1958
    East Village          1853
    Upper East Side       1798
    Crown Heights         1564
    Midtown               1545
    Name: neighbourhood, dtype: int64
#grabbing top 10 neighbourhoods for sub-dataframe
sub_7=df.loc[df['neighbourhood'].isin(['Williamsburg','Bedford-Stuyvesant','Harlem','Bushwick',
                 'Upper West Side','Hell\'s Kitchen','East Village','Upper East Side','Crown Heights','Midtown'])]

#using catplot to represent multiple interesting attributes together and a count
viz_3=sns.catplot(x='neighbourhood', hue='neighbourhood_group', col='room_type', data=sub_7, kind='count')
viz_3.set_xticklabels(rotation=90)

4 – Mapas

– Mapa de precios:

viz_4=sub_6.plot(kind='scatter', x='longitude', y='latitude', label='availability_365', c='price',
                  cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, figsize=(10,8))
viz_4.legend()
import urllib

plt.figure(figsize=(10,8))

i=urllib.request.urlopen('https://upload.wikimedia.org/wikipedia/commons/e/ec/Neighbourhoods_New_York_City_Map.PNG')
nyc_img=plt.imread(i)

#scaling the image based on the latitude and longitude max and mins for proper output
plt.imshow(nyc_img,zorder=0,extent=[-74.258, -73.7, 40.49,40.92]) #extent=[izq, drcha, abajo, arriba]
ax=plt.gca()

#using scatterplot again
sub_6.plot(kind='scatter', x='longitude', y='latitude', label='availability_365', c='price', ax=ax, 
           cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, zorder=5)
plt.legend()
plt.show()

Análisis de la columna name

– Extraemos los ‘name’ y los metemos en una lista:

#Creamos una lista vacia
_names_=[]

#tomamos los string de name y los metemos en la lista vacia
for name in df.name:
    _names_.append(name)
    
#Creamos una función que cortará esas cadenas de palabras y las meterá n array
def split_name(name):
    spl=str(name).split()
    return spl

#initializing empty list where we are going to have words counted
_names_for_count_=[]

#obtener la cadena de name de nuestra lista y usamos la función split_name, 
#luego se agrega a la lista anterior
for x in _names_:
    for word in split_name(x):
        word=word.lower() #pasamos a minusculas todo
        _names_for_count_.append(word)

– Usamos Counter para contar las palabras y determinar la más común:

from collections import Counter

_top_25_w=Counter(_names_for_count_).most_common(25)
#_top_25_w=_top_25_w[0:25]

– Metemos nuestro Top25 de palabras en un DataFrame que creamos:

sub_w=pd.DataFrame(_top_25_w)
sub_w.rename(columns={0:'Words', 1:'Count'}, inplace=True)#usamos Counter para contar las palabras y determinar la más común
from collections import Counter

_top_25_w=Counter(_names_for_count_).most_common(25)
#_top_25_w=_top_25_w[0:25]

– Visualizamos todo con un barplot:

viz_5=sns.barplot(x='Words', y='Count', data=sub_w)
viz_5.set_title('Counts of the top 25 used words for listing names')
viz_5.set_ylabel('Count of words')
viz_5.set_xlabel('Words')
viz_5.set_xticklabels(viz_5.get_xticklabels(), rotation=80)

#--------------------- Para poner las etiquetas encima de las barras-----------------------
def add_value_labels(viz_5, spacing=5):
    """Add labels to the end of each bar in a bar chart.

    Arguments:
        ax (matplotlib.axes.Axes): The matplotlib object containing the axes
            of the plot to annotate.
        spacing (int): The distance between the labels and the bars.
    """

    # For each bar: Place a label
    for rect in viz_5.patches:
        # Get X and Y placement of label from rect.
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        # Number of points between bar and label. Change to your liking.
        space = spacing
        # Vertical alignment for positive values
        va = 'bottom'

        # If value of bar is negative: Place label below bar
        if y_value < 0:
            # Invert space to place label below
            space *= -1
            # Vertically align label at top
            va = 'top'

        # Use Y value as label and format number with one decimal place
        label = "{:.1f}".format(y_value)

        # Create annotation
        viz_5.annotate(
            label,                      # Use `label` as label
            (x_value, y_value),         # Place label at end of the bar
            xytext=(0, space),          # Vertically shift label by `space`
            textcoords="offset points", # Interpret `xytext` as offset in points
            ha='center',                # Horizontally center label
            va=va,                      # Vertically align label differently for
            rotation=75)                # positive and negative values.
                                        


# Call the function above. All the magic happens there.
add_value_labels(viz_5)

Análisis de la columna number_of_reviews

top_reviewed_listings=df.nlargest(10,'number_of_reviews')
top_reviewed_listings

[table id=80 /]

price_avrg=top_reviewed_listings.price.mean()
print('Precio medio por noche: {}'.format(price_avrg))
    Precio medio por noche: 65.4