This repository contains helper scripts. - Author: Mert Cobanoglu
import argparse
# Parse command-line flags (Turkish identifiers: isim = first name, soyisim = surname, no = number)
parser = argparse.ArgumentParser()
parser.add_argument("--isim", "-i")
parser.add_argument("--soyisim", "-s")
parser.add_argument("--no", "-n")
veri = parser.parse_args()
print("isim {}".format(veri.isim))
print("soyisim {}".format(veri.soyisim))
print("no {}".format(veri.no))
path = r"C:\Users\path"
filenames = os.listdir(path)
for i in filenames:
dirs = os.path.join(path, i)
print(dirs)
import os
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(".ipynb"):
            print(os.path.join(root, file))
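The original import also pulled in glob; as a sketch, the same recursive search can be written with glob.glob (recursive=True needs Python 3.5+):
import glob
for file in glob.glob(os.path.join(path, "**", "*.ipynb"), recursive=True):
    print(file)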
import pickle
favorite_color = {"lion": "yellow", "kitty": "red"}
# Use context managers so the file handles are closed properly
with open("save.p", "wb") as f:
    pickle.dump(favorite_color, f)
with open("save.p", "rb") as f:
    favorite_color = pickle.load(f)
import datetime
hours_before = datetime.datetime.now() - datetime.timedelta(hours=2)
print(f"Current Time: {datetime.datetime.now().timestamp()}")
print(f"2 Hours Before: {hours_before.timestamp()}")
import logging
logging.basicConfig(filename='test.log', level=logging.DEBUG,
                    format='%(asctime)s:%(levelname)s:%(message)s')
def add(x, y):
    """Add Function"""
    return x + y
num_1 = 20
num_2 = 10
add_result = add(num_1, num_2)
logging.debug('Add: {} {} = {}'.format(num_1, num_2, add_result))
python -m venv myvenv                # create a virtual environment named myvenv
source myvenv/bin/activate           # activate on Linux/macOS; on Windows run myvenv\Scripts\activate
pip install simplejson               # install a package into the active venv
pip install --upgrade pip            # upgrade pip itself
pip freeze > requirements.txt        # snapshot the installed packages into requirements.txt
pip install -r requirements.txt      # install every dependency listed in the file
deactivate                           # leave the virtual environment
git reset --hard <old-commit-id>
git push -f <remote-name> <branch-name>
# Force-pushing rewrites history; not recommended when working in a collaborative environment
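A safer sketch for shared branches: git revert undoes a commit by creating a new one, so no history is rewritten (the commit id is a placeholder):
git revert <old-commit-id>
git push <remote-name> <branch-name>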
import pandas as pd
import seaborn as sns
# Assumes df is an existing DataFrame
corr = df.corr()
sns.heatmap(corr)
# Percentage of missing values per column (raw_data: an existing DataFrame)
nan_percentage = raw_data.isna().sum() * 100 / len(raw_data)
missing_percentage_df = pd.DataFrame({'column_name': raw_data.columns,
                                      'percent_missing': nan_percentage}).reset_index(drop=True)
percentage_threshold = 20  # percentage threshold used for filtering
missing_percentage_df[missing_percentage_df["percent_missing"] < percentage_threshold]
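As a follow-up sketch, the same threshold can drop the heavily missing columns (keep_cols is an illustrative name):
keep_cols = missing_percentage_df.loc[missing_percentage_df["percent_missing"] < percentage_threshold, "column_name"]
filtered_data = raw_data[keep_cols]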
import pandas as pd
df = pd.read_csv("diabetes.csv")
markdown = df.to_markdown()  # requires the tabulate package
with open("sample.txt", "w") as text_file:
    text_file.write(markdown)
from sklearn.preprocessing import LabelEncoder
import pandas as pd
cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
data = pd.read_csv("iris.data", names=cols)
# Label Encoding
label_encoder = LabelEncoder()
targets = label_encoder.fit_transform(data["class"])
# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
oh_encoder = OneHotEncoder(sparse=False)  # sparse_output=False in scikit-learn >= 1.2
targets = targets.reshape(-1, 1)  # column vector; -1 infers the row count (150 for iris)
oneho = oh_encoder.fit_transform(targets)
# Label-encode every column in the frame
for col in data.columns:
    data[col] = label_encoder.fit_transform(data[col])
# Select the object (string) columns (np.object is deprecated; compare against the builtin object)
mask = data.dtypes == object
categorical_cols = data.columns[mask]
num_ohc_cols = (data[categorical_cols]
                .apply(lambda x: x.nunique())
                .sort_values(ascending=False))
# No need to encode if there is only one value
small_num_ohc_cols = num_ohc_cols.loc[num_ohc_cols > 1]
# Number of one-hot columns is one less than the number of categories
small_num_ohc_cols -= 1
# This is 215 columns, assuming the original ones are dropped.
# This is quite a few extra columns!
small_num_ohc_cols.sum()
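For comparison, a minimal sketch of the same one-hot expansion with pandas; drop_first=True matches the "one less than the number of categories" count above:
ohc_data = pd.get_dummies(data, columns=list(categorical_cols), drop_first=True)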
# Model libraries
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
# Notification libraries
from win10toast import ToastNotifier
import time
# Measuring the total training time and showing it in the notification can be useful.
# (X_train, y_train, X_test, y_test are assumed to exist already.)
start = time.process_time()
model = RandomForestClassifier(n_estimators=700).fit(X_train, y_train)
duration = time.process_time() - start
# Get the model predictions
preds = model.predict(X_test)
# Compute our metrics
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
# Create the notification object
toaster = ToastNotifier()
toaster.show_toast("Training finished",
                   f"{acc}, {prec}, Duration: {duration}",
                   icon_path=None,
                   duration=5,
                   threaded=True)
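With threaded=True the toast is shown from a background thread, so the script keeps running instead of blocking for the five-second display.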
for name in data.columns[:20]:  # limit the number of columns to plot
    plt.figure(figsize=(30, 10))  # change figure size
    sns.scatterplot(x=data[name], y=range(0, data[name].shape[0]))  # make scatter plots
    plt.show()  # show each plot as it is created instead of waiting for all of them
import xgboost as xgb
import pandas as pd
churn_data = pd.read_csv("classification_data.csv")
# Features are every column except the last; the label is the target column
churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
                            label=churn_data.month_5_still_here)
params = {"objective": "binary:logistic", "max_depth": 4}
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
                    num_boost_round=10, metrics="error", as_pandas=True)
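With as_pandas=True, xgb.cv returns a DataFrame; a hedged sketch reading the last boosting round's test error as an accuracy figure:
print(cv_results)
accuracy = 1 - cv_results["test-error-mean"].iloc[-1]
print(f"Accuracy: {accuracy:.4f}")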
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
# preds: per-class probability rows from a multiclass model; take the argmax as the predicted class
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Precision = {}".format(precision_score(y_test, best_preds, average='macro')))
print("Recall = {}".format(recall_score(y_test, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, best_preds)))
from sklearn.metrics import classification_report
report = classification_report(y_test, best_preds)
print(report)
def dact_dist(dataset, high_corrs, class_col):
    """
    :dataset: pandas dataframe
    :high_corrs: columns to visualize
    :class_col: column holding the class labels
    """
    labels = dataset[class_col].value_counts().index.to_list()
    for col_name in high_corrs:
        fig, ax = plt.subplots(figsize=(30, 10))
        for label in labels:
            # distplot is deprecated in newer seaborn; histplot/kdeplot are the replacements
            sns.distplot(dataset[col_name][dataset[class_col] == label], ax=ax)
        ax.legend(labels)
        plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train = pd.read_csv("./train.csv")
def correlation_heatmap(train):
    correlations = train.corr()
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()
correlation_heatmap(train)
categories = ["A", "B", "C"]
plt.figure(figsize=(30,5))
for cat in categories:
g = sns.kdeplot(data_70[data['Feat1']==cat]["Feat2"],shade=True, bw=.01)
g.set_xlim(59,65)
# Grid of bar plots: mean/std/median of each parameter, grouped by the "Durum" column
# (st60_parameters: an existing list of column names)
barplot = data.groupby(by=["Durum"])[st60_parameters].agg(["mean", "std", "median"]).T
f, axes = plt.subplots(int(barplot.shape[0] / barplot.shape[1]), barplot.shape[1],
                       figsize=(20, barplot.shape[0] * 2))
counter = 0
for i in range(int(barplot.shape[0] / barplot.shape[1])):
    for y in range(barplot.shape[1]):
        g = sns.barplot(x=barplot.iloc[counter].index,
                        y=barplot.iloc[counter].values,
                        hue=barplot.iloc[counter].index,
                        ax=axes[i, y],
                        palette="Set1")
        g.set_title(barplot.iloc[counter].name)
        counter += 1  # was counter = 1, which would keep replotting the same row
Mert Cobanoglu - LinkedIn - [email protected]