Contents
My class website at Texas State: yiclass.wp.txstate.edu
MY CHEAT SHEETS for Python coding
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
!pip install pandas==2.2.1 # restart the session
!pip install pyarrow
help(sum)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
# Jupyter uses forward slashes in folder paths. For example:
path = 'C:/Users/hy11/Documents/......../data/facebook.csv'
# or path = 'C:\\Users\\hy11\\Documents\\........\\data\\facebook.csv'
numpy.inf: IEEE 754 floating-point representation of (positive) infinity.
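A quick illustration of how np.inf behaves (a minimal sketch):
import numpy as np
arr = np.array([1.0, np.inf, -np.inf, np.nan])
print(np.isinf(arr))     # [False  True  True False]
print(np.isfinite(arr))  # [ True False False False]
print(np.inf > 1e308)    # True: infinity compares greater than any finite float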
import numpy as np
np.set_printoptions(precision=2)
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:,.2f}'.format
# How to see more data
with pd.option_context('display.min_rows', 30, 'display.max_columns', 82):
    display(df.query('`ColumnA`.isna()'))
from google.colab import files
uploaded = files.upload()
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/data/')
os.getcwd()
os.listdir()
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
# Restart the kernel
import pandas as pd
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title='Heart Disease', html={'style': {'full_width': True}})
profile.to_notebook_iframe()
I/O
The tarfile module makes it possible to read and write tar archives, including those using gzip, bz2 and lzma compression. Use the zipfile module to read or write .zip files, or the higher-level functions in shutil.
import tarfile
tar = tarfile.open("sample.tar.gz")
tar.extractall(filter='data')
tar.close()
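A matching zipfile sketch (the archive name is only an example):
import zipfile
with zipfile.ZipFile("sample.zip") as zf:
    print(zf.namelist())             # list archive members
    zf.extractall("extracted_data")  # extract everything to a folder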
The urllib.request module defines functions and classes which help in opening URLs (mostly HTTP) in a complex world — basic and digest authentication, redirections, cookies and more. urllib is part of the Python Standard Library and offers basic functionality for working with URLs and HTTP requests. urllib3 is a third-party library that provides a more extensive feature set for making HTTP requests and is suitable for more complex web interaction tasks. It must be installed separately.
import urllib.request
with urllib.request.urlopen('http://www.python.org/') as f:
    print(f.read(300))
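For comparison, a minimal urllib3 sketch of the same request (assumes urllib3 is pip-installed):
import urllib3
http = urllib3.PoolManager()  # manages connection pooling and reuse
resp = http.request("GET", "http://www.python.org/")
print(resp.status)      # HTTP status code
print(resp.data[:300])  # first 300 bytes of the response body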
with open('text.txt') as f:
    text_in = f.read()
print(text_in)
import pandas as pd
url = 'https://github.com/mattharrison/datasets/raw/master/data/ames-housing-dataset.zip'
df = pd.read_csv(url, engine='pyarrow', dtype_backend='pyarrow')
# removing a file
os.remove("file_name.txt")
# renaming a file
file_name = "python.txt"
os.rename(file_name, 'Python1.txt')
# to add a folder to the module search path
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/data/')
import sys
print(sys.version)
# List out all the paths
print(sys.path)
# Print out all imported modules
sys.modules
Data Wrangling – Numpy / Pandas
np.random.seed(42)
Setting the random seed is important for reproducibility. By fixing the seed to a specific value (in this case, 42), you ensure that the sequence of random numbers generated by NumPy will be the same every time you run your code, assuming that other sources of randomness in your code are also controlled or fixed.
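A quick check of that behavior (a minimal sketch): re-seeding with the same value reproduces the same draws.
import numpy as np
np.random.seed(42)
first = np.random.rand(3)
np.random.seed(42)
second = np.random.rand(3)
print(np.array_equal(first, second))  # True: identical sequences after re-seeding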
X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance
>>> np.ones(5)
array([1., 1., 1., 1., 1.])
>>> np.ones((2, 1))
array([[1.],
       [1.]])
>>> np.c_[np.array([1,2,3]), np.array([4,5,6])]
array([[1, 4],
       [2, 5],
       [3, 6]])
df.dtypes
df.shape
df.iloc[3:5]
df.loc[(df["Col_A"] == "brown") & (df["Col_B"] == "blue"), ["Col_D", "Col_Z"]]
# data type conversion
df['year'].astype(str)
df['number'].astype(float)
y = y.astype(np.uint8)
# We can alternatively use glob, which directly allows pathname matching.
# For example, if we only want Excel .xlsx files:
data_path = os.path.join(os.getcwd(), 'data_folder')
from glob import glob
glob(os.path.join(data_path, '*.xlsx'))
output:
['/content/drive/MyDrive/Colab Notebooks/data/output.xlsx',
 '/content/drive/MyDrive/Colab Notebooks/data/beta_file.xlsx']
# index
df = df.set_index("Column 1")
df.index.name = "New Index"
# to rename columns
data.rename(columns={'OLD': 'NEW'}, inplace=True)
# reformat column names
df.columns = df.columns.str.lower().str.replace(" ", "_")
df["column_name"].str.lower()
# to print memory usage (in MB)
memory_per_column = df.memory_usage(deep=True) / 1024 ** 2
df_cat = df.copy()
Slices can be passed by name using .loc[startrow:stoprow:step, startcolumn:stopcolumn:step]
or by position using .iloc[start:stop:step, start:stop:step].
df.loc[2::10, "name":"gender"]
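The positional equivalent with .iloc (a minimal sketch; the column positions are illustrative):
df.iloc[2::10, 0:3]  # every 10th row starting at row 2, first three columns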
df.plot.scatter("X", "Y", alpha=0.5)
.isna() / .notna()
Always use pd.isnull() or pd.notnull() when conditioning on missing values. This is the most reliable.
df[pd.isnull(df['Column1'])]
df[pd.notnull(df['Column2'])].head()
df.query('ColumnA.isna()')
try:
    df = pd.read_csv(url, skiprows=0, header=0)
    print("Data loaded successfully.")
except Exception:
    print("Error loading sheet")
    # If an error occurs, halt further execution
    raise
if __name__ == "__main__":
    main()
# use the break and continue statements
for x in range(5, 10):
    if x == 7:
        break
    if x % 2 == 0:
        continue
    print(x)
# Categoricals - Pandas 2
df.select_dtypes('string')  # or 'string[pyarrow]'
df.select_dtypes('string').describe().T
# Convert string columns to the 'category' data type to save memory.
(df
 .select_dtypes('string')
 .memory_usage(deep=True)
 .sum()
)
(df
 .select_dtypes('string')
 .astype('category')
 .memory_usage(deep=True)
 .sum()
)
# Missing numeric columns (and strings in Pandas 1)
df.isna().mean().mul(100).pipe(lambda ser: ser[ser > 0])
# sample five rows
df.sample(5)
# LAMBDA
adult_data['Label'] = adult_data['Salary'].map(lambda x: 1 if '>50K' in x else 0)
df.assign(Total_Salary=lambda x: df['Salary'] + df['Bonus'])
# SORT_VALUES
oo.sort_values(by=['Edition', 'Athlete'])
# Count unique values
oo['NOC'].value_counts(ascending=True)
oo['NOC'].unique()
oo[(oo['Medal'] == 'Gold') & (oo['Gender'] == 'Women')]
oo[oo['Athlete'] == 'PHELPS, Michael']['Event']
oo["Athlete"].value_counts().sort_values(ascending=False).head(10)
df['Label'].value_counts().plot(kind='bar')  # or df.groupby('Label').size().plot(kind='bar')
oo[oo.Athlete == 'PHELPS, Michael'][['Event', 'City', 'Edition']]
# or oo[oo['Athlete'] == 'PHELPS, Michael'][['Event', 'City', 'Edition']]
# Delete a single column from the DataFrame
data = data.drop(labels="deaths", axis=1)
# Delete multiple columns from the DataFrame
data = data.drop(labels=["deaths", "deaths_per_million"], axis=1)
# Note that the "labels" parameter is by default the first, so
# the above lines can be written slightly more concisely:
data = data.drop("deaths", axis=1)
data = data.drop(["deaths", "deaths_per_million"], axis=1)
# Delete a single named column from the DataFrame
data = data.drop(columns="cases")
# Delete multiple named columns from the DataFrame
data = data.drop(columns=["cases", "cases_per_million"])
pd.concat([s1, s2], axis=1)
pd.concat([s1, s2], axis=1).reset_index()
a.to_frame().join(b)
# Delete column numbers 1, 2 and 5 from the DataFrame
# Create a list of all column numbers to keep
columns_to_keep = [x for x in range(data.shape[1]) if x not in [1, 2, 5]]
# Delete columns by column number using iloc selection
data = data.iloc[:, columns_to_keep]
# Delete a single row by index value 0
data = data.drop(labels=0, axis=0)
# Delete a few specified rows at index values 1, 15, 20.
# Note that the index values do not always align to row numbers.
data = data.drop(labels=[1, 15, 20], axis=0)
# Delete a range of rows - index values 40-44
data = data.drop(labels=range(40, 45), axis=0)
# The labels parameter name can be omitted, and axis is 0 by default
# Shorter versions of the above:
data = data.drop(0)
data = data.drop([0, 15, 20])
data = data.drop(range(10, 20))
data.shape
output: (238, 11)
# Keep only the first 100 rows.
data = data[:100]
data.shape
output: (100, 11)
data = data[10:20]
data.shape
output: (10, 11)
# handling missing variables
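A minimal sketch of the common options (the column name is illustrative):
df['ColumnA'] = df['ColumnA'].fillna(df['ColumnA'].median())  # fill missing values with the median
df = df.dropna(subset=['ColumnA'])                            # or drop rows where ColumnA is missing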
# df.append was removed in pandas 2; use pd.concat instead
df = pd.concat([df, another_row])
Plot
# In Matplotlib it is possible to change styling settings globally with runtime configuration (rc) parameters.
# The default Matplotlib styling configuration is set with matplotlib.rcParams.
# This is a dictionary containing formatting settings and their values.
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (15, 10)
mpl.rcParams["font.family"] = "monospace"
mpl.rcParams["font.family"] = "sans serif"
# Matplotlib comes with a selection of available style sheets.
# These define a range of plotting parameters and can be used to apply those parameters to your plots.
import matplotlib.pyplot as plt
plt.style.available
plt.style.use("dark_background")
plt.style.use("ggplot")
plt.style.use('fivethirtyeight')
plt.tight_layout()
number_list = [2,4,6,8,10,12]
print(number_list[::2])
output:
[2, 6, 10]
# check types
df.dtypes
# string to int
df['string_col'] = df['string_col'].astype('int')
# If you want to convert a column to numeric, I recommend using pd.to_numeric():
df['length'] = pd.to_numeric(df['length'])
df['length'].dtypes
import multiprocessing
multiprocessing.cpu_count()
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use('seaborn')  # in newer Matplotlib the style is named 'seaborn-v0_8'
import plotly.express as px
fig = px.line(df, x="lifeExp", y="gdpPercap")
fig.show()
import plotly.express as px
fig = px.line(x=x1, y=x2, width=1000, height=480, title='Returns')
fig.show()
# To show plots within this notebook we need to direct bokeh output to the notebook.
import bokeh.io
bokeh.io.output_notebook()
from bokeh.plotting import figure, show
p = figure(title='Returns', x_axis_label='Date', y_axis_label='GOOG', height=400, width=800)
p.line(x, y1, color='firebrick', line_width=2)
p.line(x, y2, color='navy', line_width=2)
show(p)
import altair as alt
base1 = alt.Chart(m, width=800, height=400).encode(x='Date', y="GOOG_Returns")
base2 = alt.Chart(m, width=800, height=400).encode(x='Date', y="Strategy_Returns")
base1.mark_line(color='gray') + base2.mark_line(color='navy')
https://www.tomasbeuzen.com/python-programming-for-data-science/chapters/chapter9-wrangling-advanced.html
Finance
import pandas_datareader.data as pdr
import yfinance as yf
yf.pdr_override()
df = pdr.get_data_yahoo('TSLA AAPL NVDA', start='2020-01-01', end='2023-12-31')['Adj Close']
import yfinance as yf
import matplotlib.pyplot as plt
df = yf.download('TSLA AAPL AMD NVDA', start = '2020-01-01', end = '2023-12-31')['Adj Close']
df.divide(df.iloc[0]).plot()
plt.show()
tickers = ['SBUX', 'WMT', 'AMZN', 'HD']
mydata = pd.DataFrame()
for t in tickers:
    mydata[t] = yf.download(t, start="2000-01-01", end="2022-05-31")['Adj Close']
mydata.head()
import pandas_datareader as pdr
factors_ff3_monthly_raw = pdr.DataReader(
    name="F-F_Research_Data_Factors",
    data_source="famafrench",
    start=start_date,
    end=end_date)[0]
factors_ff3_monthly = (factors_ff3_monthly_raw
    .divide(100)
    .reset_index(names="month")
    .assign(month=lambda x: pd.to_datetime(x["month"].astype(str)))
    .rename(str.lower, axis="columns")
    .rename(columns={"mkt-rf": "mkt_excess"})
)
# Initialize cumulative return
cumulative_return = 1
# Iterate through each year
for year in range(1926, 2010):
    best_asset = HistRet.loc[year].idxmax()       # Get the best-performing asset for the current year
    asset_return = HistRet.loc[year, best_asset]  # Get the return of the best-performing asset for the current year
    cumulative_return *= (1 + asset_return)       # Update cumulative return
print("Final cumulative dollar return:", '${:,.2f}'.format(cumulative_return))
STAT
import statsmodels.api as sm
X = sm.add_constant(data['X'])
model = sm.OLS(data['Y'], X).fit()
print(model.summary())
beta = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(y))
pd.Series(beta, index=X.columns)
https://colab.research.google.com/notebooks/charts.ipynb#scrollTo=N-u5cYwpS-y0
from bokeh.io import output_notebook
output_notebook()
# Create dependent and independent variables, intercept, dummies
import patsy as pts
y, x = pts.dmatrices('aapl ~ index', data=df, return_type='dataframe')
import statsmodels.formula.api as smf
result = smf.ols(formula='aapl ~ index', data=df).fit()
result.summary()
Sklearn
Module | Description |
---|---|
sklearn.datasets | Built-in example datasets |
sklearn.preprocessing | Various data preprocessing utilities (transformation, normalization, scaling, etc.) |
sklearn.feature_selection | Tools for selecting features |
sklearn.feature_extraction | Used for feature extraction |
sklearn.decomposition | Dimensionality-reduction algorithms (PCA, NMF, Truncated SVD, etc.) |
sklearn.model_selection | APIs for splitting data into train/test sets for cross-validation and for finding optimal parameters (GridSearch, etc.) |
sklearn.metrics | Various performance measures for classification, regression, clustering, and pairwise comparisons (Accuracy, Precision, Recall, ROC-AUC, RMSE, etc.) |
sklearn.pipeline | Utilities for bundling feature-processing transformations with ML model training and prediction |
sklearn.linear_model | Regression algorithms such as linear regression, Ridge, Lasso, and logistic regression, plus SGD (Stochastic Gradient Descent) |
sklearn.svm | Support vector machine algorithms |
sklearn.neighbors | Nearest-neighbor algorithms (k-NN, etc.) |
sklearn.naive_bayes | Naive Bayes algorithms (Gaussian NB, multinomial NB, etc.) |
sklearn.tree | Decision tree algorithms |
sklearn.ensemble | Ensemble algorithms (Random Forest, AdaBoost, GradientBoost, etc.) |
sklearn.cluster | Unsupervised clustering algorithms (k-Means, hierarchical clustering, DBSCAN, etc.) |
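A minimal sketch that chains several of these modules together (the dataset and model choice are just for illustration):
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
pipe = Pipeline([('scaler', StandardScaler()),            # preprocessing step
                 ('clf', LogisticRegression(max_iter=1000))])  # model step
pipe.fit(X_train, y_train)
accuracy_score(y_test, pipe.predict(X_test))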
#Loading the data
from sklearn import datasets
X, y = datasets.load_wine(return_X_y=True) # Classification
from sklearn import datasets
diabetes = datasets.load_diabetes() # Regression
X, y = diabetes.data, diabetes.target
# Split to training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
#Preprocessing
Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
Normalization
from sklearn.preprocessing import Normalizer
norm = Normalizer()
norm_X_train = norm.fit_transform(X_train)
norm_X_test = norm.transform(X_test)
Binarization
from sklearn.preprocessing import Binarizer
binary = Binarizer(threshold=0.0)
binary_X = binary.fit_transform(X)
Encoding Categorical Features
Encode categorical features that have string values
from sklearn.preprocessing import LabelEncoder
lab_enc = LabelEncoder()
y = lab_enc.fit_transform(y)
Imputer
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=0, strategy='mean')
imp_mean.fit_transform(X_train)
Generating Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
poly.fit_transform(X)
# FIT THE SUPERVISED LEARNING MODEL
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
lr = LinearRegression()
lr.fit(X_train, y_train)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
svm_svc = SVC(kernel='linear')
svm_svc.fit(X_train, y_train)
gnb = GaussianNB()
# PREDICT Supervised Estimators
y_pred = lr.predict(X_test)
y_pred = svm_svc.predict(X_test)
y_pred = knn.predict_proba(X_test) #Estimate probability of a label
# FIT THE UNSUPERVISED LEARNING MODEL
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans_model = kmeans.fit(X_train) #Fit the model to the data
pca = PCA(n_components=2)
pca_model = pca.fit_transform(X_train) #Fit to data, then transform it
# PREDICT Unsupervised Estimators
y_pred = kmeans_model.predict(X_test) #Predict labels in clustering algos
# FIT THE NN LEARNING MODEL
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# EVALUATION
from sklearn import metrics
Classification Metrics
Accuracy Score
lr.score(X_test, y_test)  # Estimator score method
knn.score(X_test, y_test)
from sklearn.metrics import accuracy_score  # Metric scoring functions
accuracy_score(y_test, y_pred)
Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))  # Precision, recall, f1-score and support
Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
Regression Metrics
Mean Squared Error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)
Mean Absolute Error
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mean_absolute_error(y_true, y_pred)
R2 Score
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
Clustering Metrics
Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)
Homogeneity
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred)
V-measure
from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred)
# Cross-validation
from sklearn.model_selection import cross_val_score
cross_val_score(lr, X, y, cv=5, scoring='f1_macro')
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))
# Model tuning
# Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
model = GridSearchCV(svm_svc, parameters)
model.fit(X_train, y_train)
print(model.best_score_)
print(model.best_estimator_)
params = {'n_neighbors': np.arange(1, 3), 'metric': ['euclidean', 'cityblock']}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)
# Randomized Parameter Optimization
from sklearn.model_selection import RandomizedSearchCV
params = {'n_neighbors': range(1, 5), 'weights': ['uniform', 'distance']}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4, n_iter=8, random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)
# Perform variance thresholding on raw features
from sklearn.feature_selection import VarianceThreshold
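A minimal usage sketch (the zero threshold and variable names are illustrative):
selector = VarianceThreshold(threshold=0.0)  # drop features whose variance is zero
X_reduced = selector.fit_transform(X)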
# replace each attribute's missing values with the median of that attribute:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
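Applying the imputer (a minimal sketch; assumes X_train / X_test contain only numeric columns):
X_train_imputed = imputer.fit_transform(X_train)  # learn the medians on the training set
X_test_imputed = imputer.transform(X_test)        # reuse those medians on the test set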
# convert these categories from text to numbers
from sklearn.preprocessing import OrdinalEncoder
data_encoded = OrdinalEncoder().fit_transform(data)
# OneHotEncoder class to convert categorical values into one-hot vectors
from sklearn.preprocessing import OneHotEncoder
data_encoded = OneHotEncoder().fit_transform(data)
Sklearn Example by Algorithm
# linear regression
# training
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)
print ('Intercept: ', regr.intercept_)
print ('Coefficients: ', regr.coef_)
# prediction
test_y_ = regr.predict(test_x)
# evaluation
from sklearn.metrics import r2_score
print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y) ** 2))
print("R2-score: %.2f" % r2_score(test_y , test_y_) )
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
lin_reg.intercept_, lin_reg.coef_
# stochastic gradient descent
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
sgd_reg.fit(X, y.ravel())
# Ridge
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])
# Lasso
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])
# Elastic net
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])
# Logistic regression
from sklearn.linear_model import LogisticRegression
logi_reg = LogisticRegression()
logi_reg.fit(X, y)
logi_reg.predict([[1.7], [1.5]])
# logistic with GridSearch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(features, labels)
# Softmax regression
softmax_reg = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10)
softmax_reg.fit(X, y)
softmax_reg.predict([[5, 2]])
softmax_reg.predict_proba([[5, 2]])
# SVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svc = SVC()
parameters = {'kernel': ['linear', 'rbf'],'C': [0.1, 1, 10]}
cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
# KNN
# preprocessing
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
# training
from sklearn.neighbors import KNeighborsClassifier
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
neigh
# prediction
yhat = neigh.predict(X_test)
# evaluation
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
# KNN example
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)
output:
0.631578947368421
# multi-layer perceptron - classical feed forward artificial neural network
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
parameters = {'hidden_layer_sizes': [(10,), (50,), (100,)],
'activation': ['relu', 'tanh', 'logistic'],
'learning_rate': ['constant', 'invscaling', 'adaptive']
}
cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
cv.best_estimator_
# Random Forest - a collection of independent decision trees combined to improve predictive performance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
parameters = {
'n_estimators': [5, 50, 250],
'max_depth': [2, 4, 8, 16, 32, None]
}
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
# Ensemble - Boosting
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
gb = GradientBoostingClassifier()
parameters = {
'n_estimators': [5, 50, 250, 500],
'max_depth': [1, 3, 5, 7, 9],
'learning_rate': [0.01, 0.1, 1, 10, 100]
}
cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
# decision tree
from sklearn.tree import DecisionTreeRegressor
# Split our data into a training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=1)
# training
regression_tree = DecisionTreeRegressor(criterion='squared_error')  # 'mse' in older scikit-learn versions
regression_tree.fit(X_train, Y_train)
# evaluation / prediction
regression_tree.score(X_test, Y_test)
prediction = regression_tree.predict(X_test)
print("$",(prediction - Y_test).abs().mean()*1000)
# K means - clustering
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
customers_scaled = scaler.fit_transform(customers[['Income', 'SpendingScore']])
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3, n_init=25, random_state=1234)
km.fit(customers_scaled)
km.labels_
km.inertia_
km.cluster_centers_
# determining k
from sklearn.metrics import silhouette_score, calinski_harabasz_score
wcss = []; wcss.append(km.inertia_)
silhouette = []; silhouette.append(silhouette_score(customers_scaled, km.labels_))
calinski = []; calinski.append(calinski_harabasz_score(customers_scaled, km.labels_))
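In practice these lists are filled inside a loop over candidate values of k (a minimal sketch; the range 2-10 is illustrative):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
wcss, silhouette, calinski = [], [], []
for k in range(2, 11):
    km_k = KMeans(n_clusters=k, n_init=25, random_state=1234).fit(customers_scaled)
    wcss.append(km_k.inertia_)
    silhouette.append(silhouette_score(customers_scaled, km_k.labels_))
    calinski.append(calinski_harabasz_score(customers_scaled, km_k.labels_))
# Plot each list against k: look for the "elbow" in wcss and for peaks in the silhouette / Calinski-Harabasz scores.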
DEEP LEARNING
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
### Example 1
# feature normalization (or scaling)
normalized_feature = keras.utils.normalize(X.values)
# Import train_test_split function from sklearn.model_selection
from sklearn.model_selection import train_test_split
# Split up the data into a training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
# Build the Network
from tensorflow import keras
from keras.models import Sequential  # or: from tensorflow.keras.models import Sequential
from keras.layers import Dense
## Build Model (a three-layer network with one hidden layer)
model = Sequential()
model.add(Dense(4, input_dim=4, activation='relu'))  # You don't have to specify input size. Just define the hidden layers
model.add(Dense(3, activation='relu'))
model.add(Dense(1))
# Compile Model
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
# Fit the Model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=32)
# Inspect the model
model.summary()
model.evaluate(X_test, y_test)[1]
# Predict SALES using the test data
test_predictions = model.predict(X_test).flatten()
NLP / Text Analysis
# to tokenize
text.split()
# to create counter
import collections
collections.Counter()
collections.Counter().most_common(10)
# to remove special characters
import re
re.sub(r'[^\w]', ' ', text)
text.lower()
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
for word in words:
    if word not in stop_words:
        words_no_stop.append(word)
from nltk.stem import PorterStemmer
for word in words_no_stop:
    words_clean.append(PorterStemmer().stem(word))
from sklearn.feature_extraction.text import TfidfVectorizer
# Build a vocabulary from our training text and transform training text
training_dtm_tf = TfidfVectorizer(stop_words='english').fit_transform(training_text)