ML Lab Record
Gulshan Kumar
22PGMCA15
MCA 2nd Semester
LINEAR REGRESSION
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the diabetes dataset (fixed: the path string was missing its closing quote).
df = pd.read_csv("/content/drive/MyDrive/ML_DATASET/Copy of diabetes.csv")
df.head()
df.shape
df.isnull().sum()

# Features: every column except BMI; target: BMI.
x = df.drop(columns='BMI', axis=1)
print(x)
x.shape
y = df['BMI']

# Standardize the features (fixed: standard_data was printed and assigned
# before it was ever computed).
scaler = StandardScaler()
standard_data = scaler.fit_transform(x)
print(standard_data)
x = standard_data
print(x)

# 80/20 train-test split with a fixed seed (fixed: 'random_state' had been
# broken across two lines, a syntax error).
x_train, x_tst, y_train, y_tst = train_test_split(x, y, test_size=0.2, random_state=2)
print(x.shape, x_tst.shape, x_train.shape)
import numpy as np


class Linear_Regression():
    """Simple linear regression trained with batch gradient descent.

    Minimizes mean-squared error over (X, Y) by iteratively updating a
    weight vector ``w`` and bias ``b``.
    """

    def __init__(self, learning_rate, no_of_iterations):
        # Hyperparameters: step size and number of gradient-descent steps.
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    def fit(self, X, Y):
        """Learn w and b from training matrix X (m, n) and targets Y (m,)."""
        X = np.asarray(X)
        Y = np.asarray(Y)
        # fixed: self.m (rows) and self.n (features) were read by fit() and
        # update_weights() but never initialized anywhere
        self.m, self.n = X.shape
        self.w = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        for _ in range(self.no_of_iterations):
            self.update_weights()

    def update_weights(self):
        """Perform one gradient-descent step on the MSE loss."""
        Y_prediction = self.predict(self.X)
        # Gradients of mean-squared error w.r.t. weights and bias.
        dw = -(2 * (self.X.T).dot(self.Y - Y_prediction)) / self.m
        db = -2 * np.sum(self.Y - Y_prediction) / self.m
        # Update the parameters.
        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

    def predict(self, X):
        """Return predictions X @ w + b (fixed: predict was called by
        update_weights and by the script below but was never defined)."""
        return np.asarray(X).dot(self.w) + self.b
# Re-split the standardized data (fixed: 'random_state' was broken across
# two lines, a syntax error).
x_train, x_tst, y_train, y_tst = train_test_split(x, y, test_size=0.2, random_state=2)
print(x.shape, x_tst.shape, x_train.shape)

# Train the hand-written gradient-descent regressor.
model = Linear_Regression(learning_rate=0.02, no_of_iterations=1000)
model.fit(x_train, y_train)
print('weight = ', model.w[0])
print('bias = ', model.b)

test_data_prediction = model.predict(x_tst)
print(test_data_prediction)

# Plot against the first feature column only: x_tst has several columns,
# and plt.scatter/plot require a 1-D x-axis (fixed: passing the full matrix
# would raise a shape error).
plt.scatter(x_tst[:, 0], y_tst, color='red')
plt.scatter(x_tst[:, 0], test_data_prediction, color='blue')
plt.xlabel('Symptom')  # fixed typo: 'Sympton'
plt.ylabel('Outcome')
plt.title(' Diabetes')
plt.show()
Label Encoder
import pandas as pd
import numpy as np

# Load the placement dataset (fixed: the path string had been split across
# two lines, a syntax error).
dfp = pd.read_csv('/content/drive/MyDrive/ML_DATASET/Copy of Placement_Dataset.csv')
dfp.shape

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# fixed: head() was being called on df (a different dataframe from an earlier
# section) instead of the placement dataframe dfp loaded above
dfp.head()
# print(dfp.iloc[1,:])
Logistic Regression
import pandas as pd
import numpy as np

# Load the breast-cancer dataset (fixed: the path string had been split
# across two lines).
df = pd.read_csv('/content/drive/MyDrive/ML_DATASET/Copy of breast_cancer_data.csv')
df.shape
df.isnull().sum()

# Encode the text 'diagnosis' column into a numeric 'outcome' column.
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
label = l.fit_transform(df.diagnosis)
df["outcome"] = label
df.head()

# Class distribution and per-class means (fixed: .values_count() is not a
# pandas method — it is .value_counts(); also moved after the encoding step,
# since the numeric 'outcome' column does not exist before it).
df['outcome'].value_counts()
df.groupby('outcome').mean()

df = df.drop(columns="diagnosis")
x = df.drop(columns='outcome')
y = df['outcome']

from sklearn.model_selection import train_test_split
# fixed: the assignment had been split across two lines, a syntax error
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
model.predict(X_test)
model.score(X_test, y_test)
# recorded output: 0.34502923976608185
model.coef_
model.intercept_
# recorded output: array([-7.40629666e-17])
model.predict(X_test)
PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler  # fixed: used below but never imported here

cancer = load_breast_cancer()
# creating dataframe from the feature matrix
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])

scalar = StandardScaler()
# fitting and transforming to zero-mean / unit-variance features
scalar.fit(df)
scaled_data = scalar.transform(df)

# Importing PCA
from sklearn.decomposition import PCA
# fixed: x_pca was used before PCA was ever constructed or fitted — project
# onto the first two principal components
pca = PCA(n_components=2)
x_pca = pca.fit_transform(scaled_data)
x_pca.shape

# giving a larger plot
plt.figure(figsize=(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='plasma')

# plotting heatmap of the component loadings (fixed: df_comp was undefined)
df_comp = pd.DataFrame(pca.components_, columns=cancer['feature_names'])
sns.heatmap(df_comp)
OneHot Encoding
# One-hot encoding via pandas get_dummies on the hometown house-price dataset.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the dataset with a categorical 'town' column.
df = pd.read_csv("/content/drive/MyDrive/Copy of hometown.csv")
df
# One 0/1 dummy column per distinct town name.
dummies = pd.get_dummies(df.town)
dummies
# Append the dummy columns to the original frame.
merged = pd.concat([df,dummies],axis='columns')
merged
# Drop the original categorical column...
final = merged.drop(['town'], axis='columns')
final
# ...and one dummy column to avoid the dummy-variable trap (the dropped
# town becomes the implicit baseline).
final = final.drop(['west windsor'], axis='columns')
final
# Features: everything except price; target: price.
X = final.drop('price', axis='columns')
X
y = final.price
df
X.head()
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
# NOTE(review): a standardized copy is computed here but the model below is
# fitted on the unscaled X; 'S' is never used again.
standard_data=scaler.fit_transform(X)
print(standard_data)
S=standard_data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X,y)
model.predict(X)
model.score(X,y)
final
# Predict prices for new (area, dummy1, dummy2) rows.
model.predict([[3400,0,0]])
model.predict([[2600,0,0]])
model.predict([[3600,0,1]])
model.predict([[2800,0,1]])
Using sklearn's OneHotEncoder. The first step is to use a label encoder to convert the town names into numbers.
from sklearn.preprocessing import LabelEncoder

# Convert town names to integer codes first.
le = LabelEncoder()
dfle = df
dfle.town = le.fit_transform(dfle.town)
dfle

X = dfle[['town', 'area']].values
X
y = dfle.price
y

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standard_data = scaler.fit_transform(X)
print(standard_data)

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 (the town code), pass 'area' through unchanged
# (fixed: the string 'passthrough' had been split across two lines).
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
X
# Drop the first dummy column to avoid the dummy-variable trap.
X = X[:, 1:]
X

# NOTE(review): 'model' is the LinearRegression instance created in the
# previous section; it is refitted here on the one-hot-encoded features.
model.fit(X, y)
final
model.predict([[0, 1, 3400]])
SVM
# SVM experiment setup. NOTE(review): only the data loading and label
# encoding appear in this section — no SVC is actually fitted here.
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Load the iris dataset.
df=pd.read_csv("/content/drive/MyDrive/Copy of iris_data.csv")
df.head()
df.shape
df.info()
df.isnull().sum()
df.columns
from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# Encode the text Species labels into integer classes.
l=LabelEncoder()
label=l.fit_transform(df.Species)
df["outcome"]=label
df.head()
# Peek at a few rows of each encoded class (2, 1, 0).
df[df.outcome==2].head()
df[df.outcome==1].head()
df[df.outcome==0].head()
KNN
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# KNN section setup (fixed: this cell had been pasted three times verbatim;
# a single copy is kept. The final fit_transform call was also missing its
# closing parenthesis, and df.info was missing its call parentheses.)
df = pd.read_csv("/content/drive/MyDrive/Copy of iris_data.csv")
df.head()
df.info()
df.isnull().sum()
df.columns

# Encode the text Species labels into integer classes.
l = LabelEncoder()
label = l.fit_transform(df.Species)
Decision Tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier   # decision-tree classifier
from sklearn.model_selection import train_test_split  # train/test split helper
from sklearn import metrics                       # accuracy metrics
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('/content/drive/MyDrive/Copy of diabetes.csv')
print(df.head())

# Features: everything except Outcome; target: Outcome.
X = df.drop('Outcome', axis=1)
y = df.Outcome
print(y)

# 70/30 split (fixed: the call had been broken across two lines).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Baseline: unconstrained tree.
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Pruned tree: entropy criterion, depth limited to 2.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
plot_tree(clf)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
Random Forest
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

df = pd.read_csv("heart.csv")
x = df.drop(columns="target", axis=1)
y = df.target
df.isnull().sum()

# 75/25 split (fixed: 'test_size=0.25' had been broken across two lines).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=2)

clf = RandomForestClassifier(n_estimators=10, criterion="entropy")
clf = clf.fit(x_train, y_train)
# fixed: predictions were taken from an undefined name 'classifier'
y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
cm
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# fixed: 'model' was undefined here, and metrics.plot_confusion_matrix was
# removed in scikit-learn 1.2 — use ConfusionMatrixDisplay.from_estimator
ConfusionMatrixDisplay.from_estimator(clf, x_test, y_test,
                                      display_labels=['Negative', 'Positive'])
Clustering
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans  # fixed: KMeans was never imported

df = pd.read_csv('/content/drive/MyDrive/Copy of iris_data.csv')
df.head()

# Encode the text Species labels into integer classes.
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
label = l.fit_transform(df.Species)
df["outcome"] = label

x = df.drop(["Id", "Species", "outcome"], axis=1)
y = df.outcome
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train.shape, x_test.shape

# Elbow method: within-cluster sum of squares for k = 1..10
# (fixed: 'kMEans(n_cluster=i)' -> KMeans(n_clusters=i); fit on the feature
# frame x — the uppercase X used before was undefined).
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

sns.set()
plt.plot(range(1, 11), wcss)
plt.title('The elbow point graph')
plt.xlabel('no. of clusters')
plt.ylabel('wcss')  # fixed: plt.ylabel(.wcss) was a syntax error
plt.show()
Agglomerative
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Load the mall-customers dataset (fixed: the URL string had been split
# across two lines).
data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/customers.csv")
print(data.head())

# Keep only the two columns used for clustering, under shorter names.
data["Income"] = data[["Annual Income (k$)"]]
data["Spending"] = data[["Spending Score (1-100)"]]
data = data[["Income", "Spending"]]
print(data.head())
data.shape

import scipy.cluster.hierarchy as Scn
from sklearn.cluster import AgglomerativeClustering as ag

# Dendrogram with complete linkage, to eyeball the number of clusters.
dendrogram = Scn.dendrogram(Scn.linkage(data, method="complete"))
# fixed: the keyword is n_clusters, not n_cluster
cluster = ag(n_clusters=3, linkage="complete")

from sklearn.cluster import AgglomerativeClustering
model = AgglomerativeClustering()
model.fit(data)
pred = model.fit_predict(data)

# NOTE(review): 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid'
# in newer matplotlib — confirm against the installed version.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(8, 8))
plt.scatter(data["Income"], data["Spending"], c=pred, cmap='rainbow', alpha=0.9)
plt.show()
data.shape
DBSCAN
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import rand_score

# Synthetic 4-blob dataset (fixed: the make_blobs call had been broken
# across two lines).
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.50, random_state=0)

db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters, ignoring the noise label (-1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(labels)

unique_labels = set(labels)
colors = ['y', 'b', 'g', 'r']
print(colors)

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
    class_member_mask = (labels == k)
    # fixed: the loop previously computed the mask but never plotted anything
    xy = X[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k')
plt.show()
# Boosting regressors on the iris data (fixed: df1 and label_encoder were
# used without ever being defined — the dataframe is loaded and the encoder
# created here so the section runs on its own).
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df1 = pd.read_csv('/content/drive/MyDrive/Copy of iris_data.csv')
label_encoder = LabelEncoder()

df1['Species'] = label_encoder.fit_transform(df1['Species'])
df1['Species'].unique()
df1['Species']

x = df1.drop(columns=['Species', 'Id'])
print(x)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standard_data = scaler.fit_transform(x)
print(standard_data)
X = standard_data
print(X)

# fixed: the target was taken from 'df' (a different dataframe) — use df1
y = df1['Species']

from sklearn.model_selection import train_test_split
# splitting the data into training and testing (fixed: the call had been
# broken across two lines)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
print(X.shape, x_test.shape, x_train.shape)

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

Model1 = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=2)
Model2 = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, random_state=2)

# fit the boosting regressors to the training data
Model1.fit(x_train, y_train)
Model2.fit(x_train, y_train)

y_pred1 = Model1.predict(x_test)
print(y_pred1)
y_pred2 = Model2.predict(x_test)
print(y_pred2)
print(y_test)

# R^2 scores on the training data.
score = Model1.score(x_train, y_train)
score
score1 = Model2.score(x_train, y_train)
score1
# Boosting classifiers on the iris data (fixed: label_encoder and metrics
# were used without being defined/imported — both are set up here so the
# section runs on its own).
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

df = pd.read_csv('/content/drive/MyDrive/Copy of iris_data.csv')
label_encoder = LabelEncoder()

df['Species'] = label_encoder.fit_transform(df['Species'])
df['Species'].unique()
df['Species']
df.head()

x = df.drop(columns=['Species', 'Id'])
print(x)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standard_data = scaler.fit_transform(x)
print(standard_data)
X = standard_data
print(X)

y = df['Species']
y
y.head()

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# splitting the data into training and testing (fixed: the call had been
# broken across two lines)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
print(X.shape, x_test.shape, x_train.shape)

model1 = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=2)
model2 = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=2)

# fit the boosting classifiers to the training data
model1.fit(x_train, y_train)
model2.fit(x_train, y_train)

y_pred1 = model1.predict(x_test)
print(y_pred1)
y_pred2 = model2.predict(x_test)
print(y_pred2)
print(y_test)
y_test

# confusion matrices for both models
cm = metrics.confusion_matrix(y_test, y_pred1)
cm
cm1 = metrics.confusion_matrix(y_test, y_pred2)
cm1

# fixed: score() was being called as (x_train, X_test) — mismatched features
# and labels (X_test was also undefined) — and the second result silently
# overwrote the first; score on the held-out test set instead
score = model1.score(x_test, y_test)
score1 = model2.score(x_test, y_test)