# coding: utf-8
"""
Feature engineering walkthrough for numeric data: raw measures, binarization,
rounding, interaction features, binning and mathematical transformations.

Created on Mon May 17 00:00:00 2017

@author: DIP
"""

# # Import necessary dependencies and settings

# In[1]:

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy.stats as spstats

# `get_ipython` is only defined inside an IPython/Jupyter session. Use the
# non-deprecated `run_line_magic` API there, and skip the inline backend when
# this exported notebook is executed as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

mpl.style.reload_library()
mpl.style.use('classic')
mpl.rcParams['figure.facecolor'] = (1, 1, 1, 0)
mpl.rcParams['figure.figsize'] = [6.0, 4.0]
mpl.rcParams['figure.dpi'] = 100


# # Raw Measures

# ## Values

# In[2]:

poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
poke_df.head()


# In[3]:

poke_df[['HP', 'Attack', 'Defense']].head()


# In[4]:

poke_df[['HP', 'Attack', 'Defense']].describe()


# ## Counts

# In[5]:

popsong_df = pd.read_csv('datasets/song_views.csv', encoding='utf-8')
popsong_df.head(10)


# # Binarization

# In[6]:

# np.array() copies the column, so the in-place thresholding below does not
# modify popsong_df['listen_count'].
watched = np.array(popsong_df['listen_count'])
watched[watched >= 1] = 1
popsong_df['watched'] = watched
popsong_df.head(10)


# In[7]:

from sklearn.preprocessing import Binarizer

# Values strictly greater than the threshold become 1, all others 0.
bn = Binarizer(threshold=0.9)
# transform() expects 2-D input: wrap the column in a list to form a single
# row, then take that row back out of the result.
pd_watched = bn.transform([popsong_df['listen_count']])[0]
popsong_df['pd_watched'] = pd_watched
popsong_df.head(11)


# # Rounding

# In[8]:

items_popularity = pd.read_csv('datasets/item_popularity.csv',
                               encoding='utf-8')
items_popularity


# In[9]:

# Rescale 'pop_percent' by 10 and by 100 and round to the nearest integer.
items_popularity['popularity_scale_10'] = np.array(
    np.round(items_popularity['pop_percent'] * 10), dtype='int')
items_popularity['popularity_scale_100'] = np.array(
    np.round(items_popularity['pop_percent'] * 100), dtype='int')
items_popularity


# # Interactions

# In[10]:

atk_def = poke_df[['Attack', 'Defense']]
atk_def.head()


# In[11]:

from sklearn.preprocessing import PolynomialFeatures

# degree=2 without the bias column yields, for inputs (a, b), the features
# a, b, a^2, a*b, b^2 -- the order the column names below rely on.
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
res = pf.fit_transform(atk_def)
res


# In[12]:

# Exponent of each input feature in every generated output feature.
pd.DataFrame(pf.powers_, columns=['Attack_degree', 'Defense_degree'])


# In[13]:

intr_features = pd.DataFrame(
    res,
    columns=['Attack', 'Defense', 'Attack^2', 'Attack x Defense', 'Defense^2'])
intr_features.head(5)


# ## Transforming new data in the future (during predictions)

# In[14]:

new_df = pd.DataFrame([[95, 75], [121, 120], [77, 60]],
                      columns=['Attack', 'Defense'])
new_df


# In[15]:

# Reuse the already fitted transformer so new rows get the same feature layout.
new_res = pf.transform(new_df)
new_intr_features = pd.DataFrame(
    new_res,
    columns=['Attack', 'Defense', 'Attack^2', 'Attack x Defense', 'Defense^2'])
new_intr_features


# # Binning

# In[16]:

fcc_survey_df = pd.read_csv('datasets/fcc_2016_coder_survey_subset.csv',
                            encoding='utf-8')
fcc_survey_df[['ID.x', 'EmploymentField', 'Age', 'Income']].head()


# ## Fixed-width binning

# ### Developer age distribution

# In[17]:

fig, ax = plt.subplots()
fcc_survey_df['Age'].hist(ax=ax, color='#A9C5D3')
ax.set_title('Developer Age Histogram', fontsize=12)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)


# ### Binning based on rounding
#
# ```
# Age Range: Bin
# ---------------
#  0 -  9  : 0
# 10 - 19  : 1
# 20 - 29  : 2
# 30 - 39  : 3
# 40 - 49  : 4
# 50 - 59  : 5
# 60 - 69  : 6
#   ... and so on
# ```

# In[18]:

# floor(age / 10) gives the decade bin; missing ages stay NaN.
fcc_survey_df['Age_bin_round'] = np.array(
    np.floor(np.array(fcc_survey_df['Age']) / 10.))
fcc_survey_df[['ID.x', 'Age', 'Age_bin_round']].iloc[1071:1076]


# ### Binning based on custom ranges
#
# ```
# Age Range : Bin
# ---------------
#  0 -  15  : 1
# 16 -  30  : 2
# 31 -  45  : 3
# 46 -  60  : 4
# 61 -  75  : 5
# 76 - 100  : 6
# ```
#
# pd.cut builds right-inclusive intervals by default, i.e. (0, 15], (15, 30],
# ...; values outside (0, 100] are left as NaN.

# In[19]:

bin_ranges = [0, 15, 30, 45, 60, 75, 100]
bin_names = [1, 2, 3, 4, 5, 6]
fcc_survey_df['Age_bin_custom_range'] = pd.cut(
    np.array(fcc_survey_df['Age']), bins=bin_ranges)
fcc_survey_df['Age_bin_custom_label'] = pd.cut(
    np.array(fcc_survey_df['Age']), bins=bin_ranges, labels=bin_names)
fcc_survey_df[['ID.x', 'Age', 'Age_bin_round',
               'Age_bin_custom_range', 'Age_bin_custom_label']].iloc[1071:1076]


# ## Quantile based binning

# In[20]:

fcc_survey_df[['ID.x', 'Age', 'Income']].iloc[4:9]


# In[21]:

fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(ax=ax, bins=30, color='#A9C5D3')
ax.set_title('Developer Income Histogram', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)


# In[22]:

quantile_list = [0, .25, .5, .75, 1.]
quantiles = fcc_survey_df['Income'].quantile(quantile_list)
quantiles


# In[23]:

fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(ax=ax, bins=30, color='#A9C5D3')

# Draw one vertical line per quantile; the last line handle is enough for a
# single shared legend entry.
for quantile in quantiles:
    qvl = ax.axvline(quantile, color='r')
ax.legend([qvl], ['Quantiles'], fontsize=10)

ax.set_title('Developer Income Histogram with Quantiles', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)


# In[24]:

quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
fcc_survey_df['Income_quantile_range'] = pd.qcut(
    fcc_survey_df['Income'], q=quantile_list)
fcc_survey_df['Income_quantile_label'] = pd.qcut(
    fcc_survey_df['Income'], q=quantile_list, labels=quantile_labels)
fcc_survey_df[['ID.x', 'Age', 'Income',
               'Income_quantile_range', 'Income_quantile_label']].iloc[4:9]


# # Mathematical Transformations

# ## Log transform

# In[25]:

# Add 1 before taking the log so that an income of 0 maps to 0 instead of -inf.
fcc_survey_df['Income_log'] = np.log((1 + fcc_survey_df['Income']))
fcc_survey_df[['ID.x', 'Age', 'Income', 'Income_log']].iloc[4:9]


# In[26]:

income_log_mean = np.round(np.mean(fcc_survey_df['Income_log']), 2)

fig, ax = plt.subplots()
fcc_survey_df['Income_log'].hist(ax=ax, bins=30, color='#A9C5D3')
ax.axvline(income_log_mean, color='r')
ax.set_title('Developer Income Histogram after Log Transform', fontsize=12)
ax.set_xlabel('Developer Income (log scale)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
# Annotation position is hard-coded in data coordinates for this dataset.
ax.text(11.5, 450, r'$\mu$='+str(income_log_mean), fontsize=10)


# ## Box–Cox transform

# In[27]:

# get optimal lambda value from non null income values
income = np.array(fcc_survey_df['Income'])
income_clean = income[~np.isnan(income)]
# With no lmbda given, boxcox returns (transformed data, fitted lambda); only
# the fitted lambda is used below. Box-Cox requires strictly positive input.
l, opt_lambda = spstats.boxcox(income_clean)
print('Optimal lambda value:', opt_lambda)


# In[28]:

# lambda = 0 is the log transform; the input is shifted by 1 to mirror the
# 'Income_log' column computed above.
fcc_survey_df['Income_boxcox_lambda_0'] = spstats.boxcox(
    (1+fcc_survey_df['Income']), lmbda=0)
fcc_survey_df['Income_boxcox_lambda_opt'] = spstats.boxcox(
    fcc_survey_df['Income'], lmbda=opt_lambda)
fcc_survey_df[['ID.x', 'Age', 'Income', 'Income_log',
               'Income_boxcox_lambda_0',
               'Income_boxcox_lambda_opt']].iloc[4:9]


# In[29]:

income_boxcox_mean = np.round(
    np.mean(fcc_survey_df['Income_boxcox_lambda_opt']), 2)

fig, ax = plt.subplots()
fcc_survey_df['Income_boxcox_lambda_opt'].hist(ax=ax, bins=30, color='#A9C5D3')
ax.axvline(income_boxcox_mean, color='r')
ax.set_title('Developer Income Histogram after Box–Cox Transform', fontsize=12)
ax.set_xlabel('Developer Income (Box–Cox transform)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
# Annotation position is hard-coded in data coordinates for this dataset.
ax.text(24, 450, r'$\mu$='+str(income_boxcox_mean), fontsize=10)