Classes

My class website at Texas State: yiclass.wp.txstate.edu

 

MY CHEAT SHEETS for Python coding

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" 
!pip install pandas==2.2.1  # restart the session
!pip install pyarrow
help(sum) 

import warnings 
warnings.filterwarnings('ignore')  

import pandas as pd 
pd.options.mode.chained_assignment = None # default='warn'
# Jupyter accepts forward slashes in file paths. For example: 
path = 'C:/Users/hy11/Documents/......../data/facebook.csv' 
# or path = 'C:\\Users\\hy11\\Documents\\........\\data\\facebook.csv'

numpy.inf 
IEEE 754 floating point representation of (positive) infinity.
import numpy as np
np.set_printoptions(precision = 2)

import pandas as pd
pd.set_option('display.max_rows', 20)
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:,.2f}'.format


# How to see more data 
with pd.option_context('display.min_rows', 30, 'display.max_columns', 82): 
  display( df.query('`ColumnA`.isna()'))
from google.colab import files 
uploaded = files.upload()
from google.colab import drive 
drive.mount('/content/drive') 

import os 
os.chdir('/content/drive/My Drive/Colab Notebooks/data/') 
os.getcwd() 
os.listdir()
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

 Restart the kernel

import pandas as pd
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title='Heart Disease', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

I/O

The tarfile module makes it possible to read and write tar archives, including those 
using gzip, bz2 and lzma compression. 
Use the zipfile module to read or write .zip  files, or the higher-level functions 
in shutil.

import tarfile
tar = tarfile.open("sample.tar.gz")
tar.extractall(filter='data')
tar.close()
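A zipfile counterpart, as a minimal sketch (the archive name is hypothetical):

import zipfile
with zipfile.ZipFile("sample.zip") as zf:   # hypothetical archive name
    print(zf.namelist())                    # list the archived files
    zf.extractall()                         # extract into the current directory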
The urllib.request module defines functions and classes which help in opening URLs (mostly HTTP) in a complex world — basic and digest authentication, redirections, cookies and more.
urllib is part of the Python Standard Library and offers basic functionality for working with URLs and HTTP requests. 
urllib3 is a third-party library that provides a more extensive feature set for making HTTP requests and is suitable for more complex web interaction tasks. It must be installed separately.

import urllib.request 
with urllib.request.urlopen('http://www.python.org/') as f:
     print(f.read(300))
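A minimal urllib3 sketch for comparison (install it first with pip install urllib3):

import urllib3
http = urllib3.PoolManager()                           # manages connection pooling
resp = http.request('GET', 'https://www.python.org/')
print(resp.status)
print(resp.data[:300])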
with open('text.txt') as f: 
     text_in = f.read() 

print(text_in)
import pandas as pd 
url = 'https://github.com/mattharrison/datasets/raw/master/data/ames-housing-dataset.zip' 
df = pd.read_csv(url, engine='pyarrow', dtype_backend='pyarrow')
#removing the file. 
os.remove("file_name.txt")

file_name = "python.txt" 
os.rename(file_name,'Python1.txt')

# to tell the path 
import sys 
sys.path.append('/content/drive/My Drive/Colab Notebooks/data/')

import sys
print(sys.version)

#Listing out all the paths
import sys
print(sys.path)

# print out all modules imported
sys.modules

Data Wrangling – Numpy / Pandas

np.random.seed(42)

Setting the random seed is important for reproducibility. 
By fixing the seed to a specific value (in this case, 42), 
you ensure that the sequence of random numbers generated 
by NumPy will be the same every time you run your code, 
assuming that other sources of randomness in your code are 
also controlled or fixed.
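A quick check: re-seeding reproduces the identical draws.

import numpy as np
np.random.seed(42)
a = np.random.rand(3)
np.random.seed(42)
b = np.random.rand(3)
np.array_equal(a, b)   # True: same seed, same sequence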
X_b = np.c_[np.ones((100, 1)), X]    # add x0 = 1 to each instance
>>> np.ones(5)
array([1., 1., 1., 1., 1.])

>>> np.ones((2, 1))
array([[1.],
       [1.]])

>>> np.c_[np.array([1,2,3]), np.array([4,5,6])]
array([[1, 4],
       [2, 5],
       [3, 6]])
 
 
df.dtypes
df.shape
df.iloc[3:5]
df.loc[(df["Col_A"] == "brown") & (df["Col_B"] == "blue"), ["Col_D", "Col_Z"]]

# data type conversion 
df['year'].astype(str) 
df['number'].astype(float)

y = y.astype(np.uint8)
# We can alternatively use glob, which directly supports pathname pattern matching. 
# For example, if we only want Excel .xlsx files:

data_path = os.path.join(os.getcwd(), 'data_folder')

from glob import glob 
glob(os.path.join(data_path, '*.xlsx'))

output: 

'/content/drive/MyDrive/Colab Notebooks/data/output.xlsx',  
'/content/drive/MyDrive/Colab Notebooks/data/beta_file.xlsx',  

# index

df = df.set_index("Column 1")

df.index.name = "New Index"

# to rename 
data.rename(columns={'OLD': 'NEW'}, inplace=True)  

# reformat column names 
df.columns = df.columns.str.lower().str.replace(" ", "_")
df["column_name"].str.lower() 

# to print memory usage 
memory_per_column = df.memory_usage(deep=True) / 1024 ** 2 

df_cat = df.copy()
Slices can be passed by name using .loc[startrow:stoprow:step, startcolumn:stopcolumn:step] or by position using .iloc[start:stop:step, start:stop:step].
df.loc[2::10, "name":"gender"]
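The positional counterpart, as a minimal sketch (the column numbers are arbitrary):

df.iloc[2::10, 0:3]   # every 10th row starting at row 2, first three columns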
df.plot.scatter("X", "Y", alpha=0.5)
.isna()
.notna()

Always use pd.isnull() or pd.notnull() when conditioning on missing values; these are the most reliable.
df[pd.isnull(df['Column1'])]
df[pd.notnull(df['Column2'])].head()

df.query('ColumnA.isna()')

try:
     df = pd.read_csv(url, skiprows=0, header=0)
     print("Data loaded successfully.")
except Exception:
     print("Error loading sheet")
     # If an error occurs, halt further execution
     raise
if __name__ == "__main__":
     main()
# use the break and continue statements
for x in range(5,10):
    if (x == 7): break
    if (x % 2 == 0): continue
    print (x)

 

# Categoricals - Pandas 2
df.select_dtypes('string')  # or 'string[pyarrow]'
# Categoricals
df.select_dtypes('string').describe().T

# Convert string columns to the `'category'` data type to save memory.
(df
.select_dtypes('string')
.memory_usage(deep=True)
.sum()
)


(df
.select_dtypes('string')
.astype('category')
.memory_usage(deep=True)
.sum()
)
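To actually keep the savings, assign the converted columns back; a minimal sketch mirroring the selection above:

string_cols = df.select_dtypes('string').columns
df[string_cols] = df[string_cols].astype('category')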
# Missing numeric columns (and strings in Pandas 1)

df.isna().mean().mul(100).pipe(lambda ser: ser[ser > 0])
 
# sample five rows
df.sample(5)
# LAMBDA 
adult_data['Label'] = adult_data['Salary'].map(lambda x : 1 if '>50K' in x else 0) 
df.assign(Total_Salary = lambda x: df['Salary'] + df['Bonus'])

# SORT_VALUES 
oo.sort_values(by = ['Edition', 'Athlete']) 

# Count unique values 
oo['NOC'].value_counts(ascending = True)
oo['NOC'].unique() 
oo[(oo['Medal']=='Gold') & (oo['Gender'] == 'Women')] 
oo[oo['Athlete'] == 'PHELPS, Michael']['Event'] 
oo["Athlete"].value_counts().sort_values(ascending = False).head(10) 
df['Label'].value_counts().plot(kind='bar') # or df.groupby('Label').size().plot(kind='bar') 
oo[oo.Athlete == 'PHELPS, Michael'][['Event', 'City','Edition']]  #or oo[oo['Athlete'] == 'PHELPS, Michael'][['Event', 'City','Edition']] 


# Delete a single column from the DataFrame 
data = data.drop(labels="deathes", axis=1) 

# Delete multiple columns from the DataFrame 
data = data.drop(labels=["deaths", "deaths_per_million"], axis=1) 

# Note that the "labels" parameter is by default the first, so # the above lines can be written slightly more concisely: 
data = data.drop("deaths", axis=1) 
data = data.drop(["deaths", "deaths_per_million"], axis=1) 

# Delete a single named column from the DataFrame 
data = data.drop(columns="cases") 

# Delete multiple named columns from the DataFrame 
data = data.drop(columns=["cases", "cases_per_million"]) 
pd.concat([s1, s2], axis=1) 
pd.concat([s1, s2], axis=1).reset_index()
a.to_frame().join(b) 

# Delete column numbers 1, 2 and 5 from the DataFrame
# Create a list of all column numbers to keep 
columns_to_keep = [x for x in range(data.shape[1]) if x not in [1,2,5]] 

# Delete columns by column number using iloc selection 
data = data.iloc[:, columns_to_keep] 

# delete a single row by index value 0
data = data.drop(labels=0, axis=0) 

# delete a few specified rows at index values 0, 15, 20. 
# Note that the index values do not always align to row numbers. 
data = data.drop(labels=[0, 15, 20], axis=0) 

# delete a range of rows - index values 10-20 
data = data.drop(labels=range(10, 20), axis=0)  

# The labels parameter name can be omitted, and axis is 0 by default 
# Shorter versions of the above: 
data = data.drop(0) 
data = data.drop([0, 15, 20]) 
data = data.drop(range(10, 20)) 
data.shape  
output: (238, 11) 

# Delete everything but the first 100 rows. 
data = data[:100] 
data.shape  
output: (100, 11) 

data = data[10:20] 
data.shape  
output: (10, 11)

# append a row (DataFrame.append was removed in pandas 2; use pd.concat instead)
df = pd.concat([df, another_row], ignore_index=True)

Plot

# In Matplotlib it is possible to change styling settings globally with runtime configuration (rc) parameters. 
# The default Matplotlib styling configuration is set with matplotlib.rcParams. 
# This is a dictionary containing formatting settings and their values. 
import matplotlib as mpl 
mpl.rcParams['figure.figsize'] = (15, 10) 
mpl.rcParams["font.family"] = "monospace"
mpl.rcParams["font.family"] = "sans serif"

# Matplotlib comes with a selection of available style sheets. 
# These define a range of plotting parameters and can be used to apply those parameters to your plots. 
import matplotlib.pyplot as plt 
plt.style.available 
plt.style.use("dark_background") 
plt.style.use("ggplot")
plt.style.use('fivethirtyeight')

plt.tight_layout()
number_list = [2,4,6,8,10,12]
print(number_list[::2])

output:
[2, 6, 10]

# check types
df.dtypes

# string to int 
df['string_col'] = df['string_col'].astype('int')

# If you want to convert a column to numeric, I recommend using pd.to_numeric(): 
df['length'] = pd.to_numeric(df['length']) 
df['length'].dtypes


import multiprocessing
multiprocessing.cpu_count()


import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8')  # the 'seaborn' style was renamed in newer Matplotlib
import plotly.express as px
fig = px.line(df, x="lifeExp", y="gdpPercap")
fig.show()

import plotly.express as px
fig = px.line(x=x1, y=x2, width=1000, height=480, title='Returns')
fig.show()
# To show plots within this notebook, direct Bokeh output to the notebook.
import bokeh.io
bokeh.io.output_notebook()

from bokeh.plotting import figure, show
p = figure(title='Returns', x_axis_label='Date', y_axis_label='GOOG', height=400, width=800)
p.line(x, y1, color = 'firebrick', line_width=2)
p.line(x, y2, color = 'navy', line_width=2)
show(p)

import altair as alt
base1= alt.Chart(m, width=800, height=400).encode(x='Date', y="GOOG_Returns")
base2= alt.Chart(m, width=800, height=400).encode(x='Date', y="Strategy_Returns")
base1.mark_line(color='gray') + base2.mark_line(color='navy')
https://www.tomasbeuzen.com/python-programming-for-data-science/chapters/chapter9-wrangling-advanced.html

Finance

import pandas_datareader.data as pdr
import yfinance as yf 
yf.pdr_override() 
df = pdr.get_data_yahoo('TSLA AAPL NVDA', start = '2020-01-01', end = '2023-12-31')['Adj Close']

import yfinance as yf
import matplotlib.pyplot as plt
df = yf.download('TSLA AAPL AMD NVDA', start = '2020-01-01', end = '2023-12-31')['Adj Close']
df.divide(df.iloc[0]).plot()
plt.show()
tickers = ['SBUX', 'WMT', 'AMZN', 'HD']
mydata = pd.DataFrame()
for t in tickers:
    mydata[t] = yf.download(t, start="2000-01-01", end="2022-05-31")['Adj Close']

mydata.head()
import pandas_datareader as pdr

factors_ff3_monthly_raw = pdr.DataReader(
name="F-F_Research_Data_Factors",
data_source="famafrench", 
start=start_date, 
end=end_date)[0]

factors_ff3_monthly = (factors_ff3_monthly_raw
.divide(100)
.reset_index(names="month")
.assign(month=lambda x: pd.to_datetime(x["month"].astype(str)))
.rename(str.lower, axis="columns")
.rename(columns={"mkt-rf": "mkt_excess"})
)


# Initialize cumulative return
cumulative_return = 1

# Iterate through each year
for year in range(1926, 2010):
    best_asset = HistRet.loc[year].idxmax()        # Get the best-performing asset for the current year
    asset_return = HistRet.loc[year, best_asset]   # Get the return of that asset for the current year
    cumulative_return *= (1 + asset_return)        # Update cumulative return

print("Final cumulative dollar return:", '${:,.2f}'.format(cumulative_return))

STAT

import statsmodels.api as sm

X = sm.add_constant(data['X'])
model = sm.OLS(data['Y'], X).fit()
print(model.summary())

beta = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(y))
pd.Series(beta, index=X.columns)


https://colab.research.google.com/notebooks/charts.ipynb#scrollTo=N-u5cYwpS-y0

from bokeh.io import output_notebook
output_notebook()

# Create dependent and independent variables, intercept, dummies
import patsy as pts  
y, x = pts.dmatrices('aapl ~ index', data=df, return_type='dataframe')

import statsmodels.formula.api as smf

result = smf.ols(formula='aapl ~ index', data=df).fit()
result.summary()

Sklearn

 

Module descriptions
sklearn.datasets: built-in example datasets
sklearn.preprocessing: various data preprocessing utilities (transformation, normalization, scaling, etc.)
sklearn.feature_selection: utilities for selecting features
sklearn.feature_extraction: used for feature extraction
sklearn.decomposition: dimensionality-reduction algorithms (PCA, NMF, Truncated SVD, etc.)
sklearn.model_selection: splits data into train/test sets for cross-validation and provides APIs for finding optimal parameters (GridSearch, etc.)
sklearn.metrics: performance measures for classification, regression, clustering, and pairwise comparisons (Accuracy, Precision, Recall, ROC-AUC, RMSE, etc.)
sklearn.pipeline: utilities for chaining feature transformations with model training and prediction (see the sketch after this list)
sklearn.linear_model: regression algorithms such as linear regression, Ridge, Lasso, and logistic regression, plus SGD (Stochastic Gradient Descent)
sklearn.svm: support vector machine algorithms
sklearn.neighbors: nearest-neighbor algorithms (k-NN, etc.)
sklearn.naive_bayes: naive Bayes algorithms (Gaussian NB, multinomial NB, etc.)
sklearn.tree: decision tree algorithms
sklearn.ensemble: ensemble algorithms (Random Forest, AdaBoost, GradientBoost, etc.)
sklearn.cluster: unsupervised clustering algorithms (k-Means, hierarchical clustering, DBSCAN, etc.)
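A minimal sketch of how a few of these modules work together, using the built-in wine dataset (also loaded in the next snippet); the pipeline scales the features and fits a logistic regression:

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X, y = datasets.load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# chain preprocessing and the estimator so both are applied consistently
pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression(max_iter=1000))])
pipe.fit(X_train, y_train)
accuracy_score(y_test, pipe.predict(X_test))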

#Loading the data
from sklearn import datasets 
X, y = datasets.load_wine(return_X_y=True)       # Classification 

from sklearn import datasets 
diabetes = datasets.load_diabetes()              # Regression 
X, y = diabetes.data, diabetes.target

 

# Split to training and testing
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


#Preprocessing

Standardization 
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler() 
scaled_X_train = scaler.fit_transform(X_train) 
scaled_X_test = scaler.transform(X_test) 

Normalization 
from sklearn.preprocessing import Normalizer 
norm = Normalizer() 
norm_X_train = norm.fit_transform(X_train) 
norm_X_test = norm.transform(X_test) 

Binarization 
from sklearn.preprocessing import Binarizer 
binary = Binarizer(threshold=0.0) 
binary_X = binary.fit_transform(X) 

Encoding Categorical Features
Encode categorical features with string value 
from sklearn.preprocessing import LabelEncoder 
lab_enc = LabelEncoder()
y = lab_enc.fit_transform(y) 

Imputer 
from sklearn.impute import SimpleImputer 
imp_mean = SimpleImputer(missing_values=0, strategy='mean') 
imp_mean.fit_transform(X_train)

Generating Polynomial Features
from sklearn.preprocessing import PolynomialFeatures 
poly = PolynomialFeatures(5)
poly.fit_transform(X)
# FIT THE SUPERVISED LEARNING MODEL
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC 

lr = LinearRegression()
lr.fit(X_train, y_train) 

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train) 

svm_svc = SVC(kernel='linear')
svm_svc.fit(X_train, y_train)

gnb = GaussianNB()


# PREDICT Supervised Estimators 
y_pred = lr.predict(X_test) 
y_pred = svm_svc.predict(X_test)
y_pred = knn.predict_proba(X_test)             #Estimate probability of a label 

# FIT THE UNSUPERVISED LEARNING MODEL
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=0)
kmeans_model = kmeans.fit(X_train) #Fit the model to the data
pca = PCA(n_components=2)
pca_model = pca.fit_transform(X_train)      #Fit to data, then transform it

# PREDICT Unsupervised Estimators        
y_pred = kmeans_model.predict(X_test)       #Predict labels in clustering algos

# FIT THE NN LEARNING MODEL
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# EVALUATION
from sklearn import metrics

Classification
Accuracy score
lr.score(X_test, y_test) 
from sklearn.metrics import accuracy_score 
accuracy_score(y_test, y_pred) 

Accuracy Score 
>>> knn.score(X_test, y_test)                      #Estimator score method
>>> from sklearn.metrics import accuracy_score     #Metric scoring functions
>>> accuracy_score(y_test, y_pred) 

Classification Report 
>>> from sklearn.metrics import classification_report 
>>> print(classification_report(y_test, y_pred))   #Precision, recall, f1-score and support 

Confusion Matrix 
>>> from sklearn.metrics import confusion_matrix 
>>> print(confusion_matrix(y_test, y_pred))

Regression Metrics
Mean Squared Error 
from sklearn.metrics import mean_squared_error 
mean_squared_error(y_test, y_pred) 

Mean Absolute Error 
>>> from sklearn.metrics import mean_absolute_error 
>>> y_true = [3, -0.5, 2] 
>>> mean_absolute_error(y_true, y_pred)

R2 Score 
from sklearn.metrics import r2_score 
r2_score(y_test, y_pred) 

Clustering Metrics
Adjusted Rand Index 
>>> from sklearn.metrics import adjusted_rand_score 
>>> adjusted_rand_score(y_true, y_pred) 

Homogeneity 
>>> from sklearn.metrics import homogeneity_score 
>>> homogeneity_score(y_true, y_pred) 

V-measure 
>>> from sklearn.metrics import v_measure_score 
>>> metrics.v_measure_score(y_true, y_pred)
# Cross-validation
from sklearn.model_selection import cross_val_score 
cross_val_score(lr, X, y, cv=5, scoring='f1_macro')

from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))
# Model tuning
Grid Search
from sklearn.model_selection import GridSearchCV 
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} 
model = GridSearchCV(svm_svc, parameters) 
model.fit(X_train, y_train) 
print(model.best_score_) 
print(model.best_estimator_)

>>> from sklearn.model_selection import GridSearchCV 
>>> params = {'n_neighbors': np.arange(1, 3), 'metric': ['euclidean', 'cityblock']} 
>>> grid = GridSearchCV(estimator=knn, param_grid=params) 
>>> grid.fit(X_train, y_train) 
>>> print(grid.best_score_) 
>>> print(grid.best_estimator_.n_neighbors)

Randomized Parameter Optimization
>>> from sklearn.model_selection import RandomizedSearchCV 
>>> params = {'n_neighbors': range(1, 5), 'weights': ['uniform', 'distance']} 
>>> rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4, n_iter=8, random_state=5) 
>>> rsearch.fit(X_train, y_train) 
>>> print(rsearch.best_score_)
# Perform variance thresholding on raw features
from sklearn.feature_selection import VarianceThreshold
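A minimal usage sketch, assuming X is the feature matrix loaded earlier (the 0.0 threshold, which only drops constant columns, is an illustrative choice):

selector = VarianceThreshold(threshold=0.0)   # keep only features with non-zero variance
X_reduced = selector.fit_transform(X)
X.shape, X_reduced.shape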


# replace each attribute's missing values with the median of that attribute:

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
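A minimal usage sketch; X_num is a hypothetical DataFrame holding only the numeric columns:

# fit on the numeric columns and fill their missing values with the learned medians
X_num_imputed = imputer.fit_transform(X_num)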
# convert these categories from text to numbers
from sklearn.preprocessing import OrdinalEncoder
data_encoded = OrdinalEncoder().fit_transform(data)

# OneHotEncoder class to convert categorical values into one-hot vectors
from sklearn.preprocessing import OneHotEncoder
data_encoded = OneHotEncoder().fit_transform(data)

Sklearn Example by Algorithm

# linear regression

# training
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)

print ('Intercept: ', regr.intercept_)
print ('Coefficients: ', regr.coef_)

# prediction
test_y_ = regr.predict(test_x)

# evaluation
from sklearn.metrics import r2_score

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y) ** 2))
print("R2-score: %.2f" % r2_score(test_y , test_y_) )
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X) 

lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
lin_reg.intercept_, lin_reg.coef_
# stochastic gradient descent

from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
sgd_reg.fit(X, y.ravel())
# Ridge
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])


# Lasso
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])
# Elastic net
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])

# Logistic regression
from sklearn.linear_model import LogisticRegression
logi_reg = LogisticRegression()
logi_reg.fit(X, y) 
logi_reg.predict([[1.7], [1.5]]) 

# logistic with GridSearch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(features, labels)
# Softmax regression
softmax_reg = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10)
softmax_reg.fit(X, y) 
softmax_reg.predict([[5, 2]])
softmax_reg.predict_proba([[5, 2]])
# SVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svc = SVC()
parameters = {'kernel': ['linear', 'rbf'],'C': [0.1, 1, 10]}
cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
# KNN 
# preprocessing
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

# training
from sklearn.neighbors import KNeighborsClassifier
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
neigh


# prediction
yhat = neigh.predict(X_test)

# evaluation
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# KNN example
from sklearn import neighbors, datasets, preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

iris = datasets.load_iris() 
X, y = iris.data[:, :2], iris.target 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33) 
scaler = preprocessing.StandardScaler().fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 

knn = neighbors.KNeighborsClassifier(n_neighbors=5) 
knn.fit(X_train, y_train) 

y_pred = knn.predict(X_test) 
accuracy_score(y_test, y_pred)

output: 
0.631578947368421
# multi-layer perceptron - classical feed forward artificial neural network
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
parameters = {'hidden_layer_sizes': [(10,), (50,), (100,)],
     'activation': ['relu', 'tanh', 'logistic'],
     'learning_rate': ['constant', 'invscaling', 'adaptive']
}
cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
cv.best_estimator_
# Random Forest - a collection of independent decision trees combined to improve accuracy and reduce overfitting
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
# Ensemble - Boosting 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}
cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

 

# decision tree
from sklearn.tree import DecisionTreeRegressor

# Split our data into a training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=1)

# training
regression_tree = DecisionTreeRegressor(criterion='squared_error')  # 'mse' was renamed to 'squared_error' in newer scikit-learn
regression_tree.fit(X_train, Y_train)

# evaluation / prediction
regression_tree.score(X_test, Y_test)
prediction = regression_tree.predict(X_test)
print("$",(prediction - Y_test).abs().mean()*1000)

# K means - clustering
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
customers_scaled = scaler.fit_transform(customers[['Income', 'SpendingScore']])

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
km = KMeans(n_clusters = 3, n_init = 25, random_state = 1234)
km.fit(customers_scaled)

km.labels_
km.inertia_
km.cluster_centers_

# determining k (scores for one candidate k; see the loop sketch below)
wcss=[]; wcss.append(km.inertia_)
silhouette=[]; silhouette.append(silhouette_score(customers_scaled, km.labels_))
calinski=[]; calinski.append(calinski_harabasz_score(customers_scaled, km.labels_))
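The single-k lines above generalize to a loop over candidate k values; a minimal sketch (the range 2-10 is arbitrary):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

wcss, silhouette = [], []
for k in range(2, 11):
    km_k = KMeans(n_clusters=k, n_init=25, random_state=1234).fit(customers_scaled)
    wcss.append(km_k.inertia_)                                        # elbow-method input
    silhouette.append(silhouette_score(customers_scaled, km_k.labels_))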

DEEP LEARNING

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
### Example 1
#feature normalization (or scaling)
from tensorflow import keras
normalized_feature = keras.utils.normalize(X.values)

# Import train_test_split function from sklearn.model_selection
from sklearn.model_selection import train_test_split

# Split up the data into a training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

# Build the Network
from tensorflow import keras
from keras.models import Sequential
#from tensorflow.keras.models import Sequential
from keras.layers import Dense

## Build Model (Building a three layer network - with one hidden layer)
model = Sequential()
model.add(Dense(4, input_dim=4, activation ='relu'))  
# Only the first layer needs input_dim; later hidden layers infer their input size
model.add(Dense(3, activation='relu'))
model.add(Dense(1))

# Compile Model
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

#  Fit the Model
history = model.fit(X_train, y_train, validation_data = (X_test, y_test),
                    epochs = 32)

#inspect the model
model.summary()

model.evaluate(X_test, y_test)[1]

# predict SALES using the test data
test_predictions = model.predict(X_test).flatten()
 

NLP / Text Analysis

# to tokenize
text.split()

# to create counter
import collections
counter = collections.Counter(text.split())

counter.most_common(10)

# to remove special characters
import re
re.sub(r'[^\w]',' ', text)

text.lower()

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
words_no_stop = []                 # words: a list of tokens, e.g. from text.split()
for word in words:
    if word not in stop_words:
        words_no_stop.append(word)

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words_clean = []
for word in words_no_stop:
    words_clean.append(stemmer.stem(word))

from sklearn.feature_extraction.text import TfidfVectorizer
# Build a vocabulary from our training text and transform training text
training_dtm_tf = TfidfVectorizer(stop_words='english').fit_transform(training_text)
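If held-out text also needs to be transformed, keep the fitted vectorizer instead of calling fit_transform inline (test_text is a hypothetical variable):

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(stop_words='english')
training_dtm_tf = tf.fit_transform(training_text)   # learn the vocabulary from training text
testing_dtm_tf = tf.transform(test_text)            # reuse that vocabulary for new text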

 

 

 

 

 
