# -*- coding: utf-8 -*- """ Created on Wed May 10 19:51:47 2017 @author: Raghav Bali """ """ This script showcases data wrangling tasks ``Execute`` $ python wrangle_data.py """ import random import datetime import numpy as np import pandas as pd from random import randrange from sklearn import preprocessing def _random_date(start,date_count): """This function generates a random date based on params Args: start (date object): the base date date_count (int): number of dates to be generated Returns: list of random dates """ current = start while date_count > 0: curr = current + datetime.timedelta(days=randrange(42)) yield curr date_count-=1 def generate_sample_data(row_count=100): """This function generates a random transaction dataset Args: row_count (int): number of rows for the dataframe Returns: a pandas dataframe """ # sentinels startDate = datetime.datetime(2016, 1, 1,13) serial_number_sentinel = 1000 user_id_sentinel = 5001 product_id_sentinel = 101 price_sentinel = 2000 # base list of attributes data_dict = { 'Serial No': np.arange(row_count)+serial_number_sentinel, 'Date': np.random.permutation(pd.to_datetime([x.strftime("%d-%m-%Y") for x in _random_date(startDate, row_count)]).date ), 'User ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10)) + user_id_sentinel).tolist()*10, 'Product ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10))+ product_id_sentinel).tolist()*10 , 'Quantity Purchased': np.random.permutation(np.random.randint(1, 42, size=row_count)), 'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel, decimals=2), 'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) for i in range(row_count)]) } # introduce missing values for index in range(int(np.sqrt(row_count))): data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0 data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1 data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101 # create data frame df = pd.DataFrame(data_dict) return df def describe_dataframe(df=pd.DataFrame()): """This function generates descriptive stats of a dataframe Args: df (dataframe): the dataframe to be analyzed Returns: None """ print("\n\n") print("*"*30) print("About the Data") print("*"*30) print("Number of rows::",df.shape[0]) print("Number of columns::",df.shape[1]) print("\n") print("Column Names::",df.columns.values.tolist()) print("\n") print("Column Data Types::\n",df.dtypes) print("\n") print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist()) print("\n") print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).nonzero()[0].tolist())) print("\n") print("Sample Indices with missing data::",pd.isnull(df).any(1).nonzero()[0].tolist()[0:5]) print("\n") print("General Stats::") print(df.info()) print("\n") print("Summary Stats::") print(df.describe()) print("\n") print("Dataframe Sample Rows::") print(df.head(5)) def cleanup_column_names(df,rename_dict={},do_inplace=True): """This function renames columns of a pandas dataframe It converts column names to snake case if rename_dict is not passed. Args: rename_dict (dict): keys represent old column names and values point to newer ones do_inplace (bool): flag to update existing dataframe or return a new one Returns: pandas dataframe if do_inplace is set to False, None otherwise """ if not rename_dict: return df.rename(columns={col: col.lower().replace(' ','_') for col in df.columns.values.tolist()}, inplace=do_inplace) else: return df.rename(columns=rename_dict,inplace=do_inplace) def expand_user_type(u_type): """This function maps user types to user classes Args: u_type (str): user type value Returns: (str) user_class value """ if u_type in ['a','b']: return 'new' elif u_type == 'c': return 'existing' elif u_type == 'd': return 'loyal_existing' else: return 'error' if __name__=='__main__': df = generate_sample_data(row_count=1000) describe_dataframe(df) print("\n\n") print("*"*30) print("Rename Columns") print("*"*30) cleanup_column_names(df) print(df.head()) print("\n\n") print("*"*30) print("Sorting Rows") print("*"*30) print(df.sort_values(['serial_no', 'price'], ascending=[True, False]).head()) print("\n\n") print("*"*30) print("Rearranging Columns") print("*"*30) print(df[['serial_no','date','user_id','user_type', 'product_id','quantity_purchased','price']].head()) print("\n\n") print("*"*30) print("Filtering Columns") print("*"*30) print("Using Column Index::") print(df.iloc[:,3].values) # or df[[3]].values.tolist() print("\n") print("Using Column Name::") print(df.quantity_purchased.values) # or df['quantity_purchased'].values.tolist() print("\n") print("Using Column Data Type::") print(df.select_dtypes(include=['float64']).values[:,0]) # or use exclude to remove certain datatype columns print("\n\n") print("*"*30) print("Subsetting Rows") print("*"*30) print("Select Specific row indices::") print(df.iloc[[10,501,20]]) print("\n") print("Excluding Specific Row indices::") print(df.drop([0,24,51], axis=0).head()) print("\n") print("Subsetting based on logical condition(s)::") print(df[df.quantity_purchased>25].head()) print("\n") print("Subsetting based on offset from top (bottom)::") print(df[100:].head()) print("\n") print("*"*30) print("TypeCasting/Data Type Conversion") print("*"*30) df['date'] = pd.to_datetime(df.date) print(df.dtypes) print("*"*30) print("Apply / Map") print("*"*30) df['user_class'] = df['user_type'].map(expand_user_type) # or applymap print("Create a new user_class attribute::") print(df.head()) print("\n") print("Get Attribute ranges::") print(df.select_dtypes(include=[np.number]).apply(lambda x: x.max()- x.min())) print("Get Week from date::") df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week if not pd.isnull(dt.week) else 0) print(df.head()) print("\n") print("*"*30) print("Handling Missing Values") print("*"*30) print("Drop Rows with missing dates::") df_dropped = df.dropna(subset=['date']) print(df_dropped.head()) print("\n") print("Fill Missing Price values with mean price::") df_dropped['price'].fillna(value=np.round(df.price.mean(),decimals=2), inplace=True) print("Fill Missing user_type values with value from \ previous row (forward fill) ::") df_dropped['user_type'].fillna(method='ffill',inplace=True) print("Fill Missing user_type values with value from \ next row (backward fill) ::") df_dropped['user_type'].fillna(method='bfill',inplace=True) print("\n") print("Drop Duplicate serial_no rows::\n") print("Duplicate sample::") print(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head()) df_dropped.drop_duplicates(subset=['serial_no'],inplace=True) print("After cleanup::") print(df_dropped.head()) print("\n") print("Remove rows which have less than 3 attributes with non-missing data::") print(df.dropna(thresh=3).head()) print("\n") print("*"*30) print("Encoding Categorical Variables") print("*"*30) print(pd.get_dummies(df,columns=['user_type']).head()) print("\n") type_map={'a':0,'b':1,'c':2,'d':3,np.NAN:-1} df['encoded_user_type'] = df.user_type.map(type_map) print((df.head())) print("\n") print("*"*30) print("Random Sampling data from DataFrame") print("*"*30) print(df.sample(frac=0.2, replace=True, random_state=42).head()) print("\n") print("*"*30) print("Normalizing Numeric Data") print("*"*30) print("Min-Max Scaler::") df_normalized = df.dropna().copy() min_max_scaler = preprocessing.MinMaxScaler() np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1)) df_normalized['price'] = np_scaled.reshape(-1,1) print(df_normalized.head()) print("\n") print("Robust Scaler::") df_normalized = df.dropna().copy() robust_scaler = preprocessing.RobustScaler() rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1)) df_normalized['quantity_purchased'] = rs_scaled.reshape(-1,1) print(df_normalized.head()) print("\n") print("*"*30) print("Data Summarization") print("*"*30) print("Aggregates based on condition::") print(df['price'][df['user_type']=='a'].mean()) print("\n") print("Row Counts on condition::") print(df['purchase_week'].value_counts()) print("\n") print("GroupBy attributes::") print(df.groupby(['user_class'])['quantity_purchased'].sum()) print("\n") print("GroupBy with different aggregates::") print(df.groupby(['user_class'])['quantity_purchased'].agg([np.sum, np.mean, np.count_nonzero])) print("\n") print("GroupBy with specific agg for each attribute::") print(df.groupby(['user_class','user_type']).agg({'price':np.mean, 'quantity_purchased':np.max})) print("\n") print("GroupBy with multiple agg for each attribute::") print(df.groupby(['user_class','user_type']).agg({'price':{ 'total_price':np.sum, 'mean_price':np.mean, 'variance_price':np.std, 'count':np.count_nonzero}, 'quantity_purchased':np.sum})) print("\n") print("Pivot tables::") print(df.pivot_table(index='date', columns='user_type', values='price',aggfunc=np.mean)) print("\n") print("Stacking::") print(df.stack()) print("\n")