ML PROGRAMS
ML PPT LINK
ML LAB PROGRAMS
EXP 1: To compute and understand the basic measures of Central Tendency (Mean, Median, Mode) and Dispersion (Variance, Standard Deviation) using Python.
Program:
# EXP 1: Compute basic measures of central tendency (mean, median, mode)
# and dispersion (sample variance, sample standard deviation).
import numpy as np       # numerical computing
from scipy import stats  # statistical functions (mode)

data = [85, 92, 78, 85, 95, 88, 72, 85, 90, 80]  # sample dataset
data_array = np.array(data)  # convert the list to a NumPy array

mean_value = np.mean(data_array)      # arithmetic average
median_value = np.median(data_array)  # middle value of the sorted data
# BUG FIX: on SciPy >= 1.11 stats.mode returns scalars by default, so the
# old `mode_result.mode[0]` indexing raises. keepdims=False (SciPy >= 1.9)
# makes the scalar behavior explicit and version-stable.
mode_result = stats.mode(data_array, keepdims=False)

print(f"Mean: {mean_value:.2f}")      # :.2f -> two decimal places
print(f"Median: {median_value:.2f}")
print(f"Mode: {mode_result.mode}")    # most frequent value

variance_value = np.var(data_array, ddof=1)  # sample variance (divides by n-1)
std_dev_value = np.std(data_array, ddof=1)   # sample standard deviation
print(f"Variance: {variance_value:.2f}")
print(f"Standard Deviation: {std_dev_value:.2f}")
EXP 2: To apply fundamental data preprocessing techniques — Attribute Selection, Handling Missing Values, Discretization, and Elimination of Outliers — using Python libraries.
# EXP 2: Data preprocessing — attribute selection, missing-value handling,
# discretization, and IQR-based outlier elimination.
import pandas as pd                                 # tabular data (DataFrames)
import numpy as np                                  # numeric ops / np.nan
from sklearn.preprocessing import KBinsDiscretizer  # continuous -> binned values

# Build the raw dataset as a DataFrame.
records = {
    'Age': [25, 30, 35, 40, 22, 60],
    'Salary': [50000, 60000, np.nan, 120000, 30000, 1000000],
    'Experience': [2, 5, 8, 15, 1, 40],
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
}
df = pd.DataFrame(records)
print("Original Dataset:\n", df)

# Attribute selection: keep only the numeric columns.
# .copy() keeps the original DataFrame untouched.
numeric_cols = ['Age', 'Salary', 'Experience']
df_selected = df[numeric_cols].copy()

# Missing values: mean-impute the one missing Salary.
salary_mean = df_selected['Salary'].mean()
df_selected['Salary'] = df_selected['Salary'].fillna(salary_mean)

# Discretization: Age into 3 equal-width bins (0 young, 1 middle, 2 older).
binner = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
df_selected['Age_Bin'] = binner.fit_transform(df_selected[['Age']])

# Outlier elimination: keep salaries inside the 1.5*IQR fences.
q1, q3 = df_selected['Salary'].quantile([0.25, 0.75])
iqr = q3 - q1
lo = q1 - 1.5 * iqr
hi = q3 + 1.5 * iqr
in_range = df_selected['Salary'].between(lo, hi)  # inclusive on both ends
df_no_outliers = df_selected[in_range]
print("\nAfter Eliminating Outliers:\n", df_no_outliers)
EXP 3: To apply the K-Nearest Neighbors (KNN) algorithm for both classification and regression tasks using Python.
# EXP 3 (regression): K-Nearest Neighbors regression on synthetic data.
import numpy as np                # numerical operations
import matplotlib.pyplot as plt   # scatter-plot visualization
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Generate a 1-feature synthetic regression dataset.
X, y = make_regression(n_samples=200, n_features=1, noise=0.1, random_state=42)

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the 5-neighbor KNN regressor.
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)

# Make predictions on the test data.
y_pred = knn_regressor.predict(X_test)

# Evaluate the model with MSE and R^2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the results.
# BUG FIX: the original line ended with a typographic quote (label='Actual’),
# which is a SyntaxError in Python; replaced with a straight quote.
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.scatter(X_test, y_pred, color='red', label='Predicted')
plt.title('KNN Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()
# EXP 3 (classification): K-Nearest Neighbors on the Iris dataset.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the dataset.
# BUG FIX: in the original, `iris = load_iris()` had been merged into the
# comment line, so `iris` was never defined (NameError on the next line).
iris = load_iris()
X = iris.data    # Features (4 columns: sepal/petal length/width)
y = iris.target  # Target labels (0: Setosa, 1: Versicolor, 2: Virginica)

# Step 2: Split the dataset into training and testing sets.
# 80% training (120 samples), 20% testing (30 samples).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create and train the KNN classifier (5 nearest neighbors).
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Step 4: Make predictions on the test data.
y_pred = knn_classifier.predict(X_test)

# Step 5: Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Step 6: Predict a new input sample.
# Order: [sepal length, sepal width, petal length, petal width].
input_sample = np.array([[5.1, 3.5, 1.4, 0.2]])  # Change values as needed
predicted_class = knn_classifier.predict(input_sample)       # class index
predicted_prob = knn_classifier.predict_proba(input_sample)  # class probabilities (optional)
print("\nInput Sample:", input_sample)
print("Predicted Flower Name:", iris.target_names[predicted_class[0]])
Exp 4: Demonstrate decision tree algorithm for a classification problem and perform parameter tuning for better results
# EXP 4: Decision-tree classification with grid-search parameter tuning.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the Iris data into pandas structures.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline: an untuned decision tree.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Default Accuracy:", accuracy_score(y_test, y_pred))

# Hyper-parameter space explored by the grid search.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
}
grid = GridSearchCV(DecisionTreeClassifier(), params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Evaluate the best model found by the 5-fold search.
best_model = grid.best_estimator_
y_best = best_model.predict(X_test)
print("Best Params:", grid.best_params_)
print("Tuned Accuracy:", accuracy_score(y_test, y_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_best))
print("\nClassification Report:\n", classification_report(y_test, y_best))
Exp 6a: Apply Random Forest algorithm for classification
# EXP 6a: Random Forest classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A 100-tree forest using Gini impurity for splits.
rf_classifier = RandomForestClassifier(n_estimators=100, criterion="gini", random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict and report the standard classification metrics.
y_pred = rf_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Exp 6b: Apply Random Forest algorithm for regression
# EXP 6b: Random Forest regression on a noisy quadratic curve.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Synthetic data: y = 3x^2 + 5x + 10 plus Gaussian noise (fixed seed).
np.random.seed(42)
X = np.linspace(0, 10, 200).reshape(-1, 1)
x_flat = X.squeeze()
y = 3 * x_flat ** 2 + 5 * x_flat + 10 + np.random.randn(200) * 10

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 200 depth-limited trees.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict on the held-out points and report error metrics.
y_pred = rf_model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# Sort the test points so the prediction line plots smoothly.
sorted_idx = X_test.squeeze().argsort()
plt.figure()
plt.scatter(X_test, y_test)
plt.plot(X_test[sorted_idx], y_pred[sorted_idx])
plt.xlabel("Input Feature (X)")
plt.ylabel("Target Value (y)")
plt.title("Random Forest Regression")
plt.show()
Exp 7: Demonstrate Naïve Bayes Classification algorithm.
# EXP 7: Gaussian Naive Bayes classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and target labels.
data = load_iris()
X, y = data.data, data.target

# 70% training / 30% testing split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the Gaussian Naive Bayes model.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Predict on the held-out set.
y_pred = nb_classifier.predict(X_test)

# Accuracy plus per-class report and confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Exp 8: Apply Support Vector algorithm for classification.
# EXP 8: Support Vector Machine classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and target labels.
data = load_iris()
X, y = data.data, data.target

# 70% training / 30% testing split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear-kernel SVM for simplicity ('rbf', 'poly', 'sigmoid' are alternatives).
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Predict on the held-out set.
y_pred = svm_classifier.predict(X_test)

# Accuracy plus per-class report and confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Exp 9: Demonstrate simple linear regression algorithm for a regression problem.
# EXP 9: Simple linear regression — fit a line and plot it over the data.
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Toy dataset: one feature, one target.
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

# sklearn expects a 2-D feature matrix, hence the reshape to a column.
x_col = x.reshape(-1, 1)
lreg = LinearRegression()
lreg.fit(x_col, y)

# Scatter the raw points and overlay the fitted regression line.
plt.scatter(x, y)
plt.plot(x, lreg.predict(x_col), color='red')
plt.title("Linear Regression")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
Exp 10: Apply Logistic regression algorithm for a classification problem.
# EXP 10: Logistic regression on a 1-D toy dataset.
import numpy
from sklearn import linear_model

# Feature values, reshaped to a column vector because sklearn expects a
# 2-D feature matrix.
X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69,
                 5.88]).reshape(-1, 1)
print(X)
# Binary class labels for each feature value.
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# Fit the logistic regression model.
logr = linear_model.LogisticRegression()
logr.fit(X, y)

# Predict the class for a new value.
predicted = logr.predict(numpy.array([3.46]).reshape(-1, 1))
print(predicted)

# FIX: the original created and fit a second, identical model here for no
# reason; the already-fitted model's coefficients are used directly.
# coef_ holds the log-odds; exponentiating gives the odds ratio per unit of X.
log_odds = logr.coef_
odds = numpy.exp(log_odds)
print(odds)
Exp 11: Demonstrate Multi-layer Perceptron algorithm for a classification problem.
# EXP 11: Multi-layer Perceptron classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Load the Iris dataset.
data = load_iris()
X = data.data                     # Features
y = data.target                   # Target labels
feature_names = data.feature_names
class_names = data.target_names

# Stratified 80/20 split keeps the class proportions equal in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features (zero mean, unit variance); the scaler is fit on
# the training set only so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# MLP: two hidden layers (100, 50), ReLU activation, Adam optimizer,
# early stopping on a 10% validation split.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,              # L2 penalty (regularization) strength
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=500,
    random_state=42,
    early_stopping=True,       # stop when validation score plateaus
    validation_fraction=0.1
)

# Train the model.
mlp.fit(X_train, y_train)

# Make predictions (hard labels and class probabilities).
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)

# Evaluate the model.
print(f"Training set score: {mlp.score(X_train, y_train):.3f}")
print(f"Test set score: {mlp.score(X_test, y_test):.3f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Plot confusion matrix.
# FIX: ConfusionMatrixDisplay and matplotlib were imported but never used,
# and the script ended at the bare "Plot confusion matrix" comment; this
# completes the promised plot.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=class_names)
plt.title("MLP Confusion Matrix")
plt.show()
Exp 12: Implement the K-means algorithm and apply it to the data you selected. Evaluate performance by measuring the sum of the Euclidean distances of each example from its cluster center. Test the performance of the algorithm as a function of the parameter K.
import numpy as np  # array math used by the K-means implementation below
from sklearn.datasets import load_iris  # source of the demo dataset
X = load_iris().data # selected data: 150 iris samples x 4 numeric features
def kmeans(X, K, i=10):
    """Lloyd's K-means clustering.

    X : (n, d) array of samples; K : number of clusters; i : iterations.
    Returns (labels, centers) where labels is (n,) and centers is (K, d).
    """
    # Initialize centers by sampling K distinct points (replace=False).
    c = X[np.random.choice(len(X), K, 0)]
    for _ in range(i):
        # Assign each point to its nearest center (squared Euclidean).
        l = ((X[:, None] - c) ** 2).sum(2).argmin(1)
        # ROBUSTNESS FIX: an empty cluster keeps its previous center; the
        # original took the mean of an empty slice, producing NaN centers.
        c = np.array([X[l == k].mean(0) if np.any(l == k) else c[k]
                      for k in range(K)])
    return l, c
def sse(X, l, c):
    """Sum of Euclidean distances from each sample to its assigned center."""
    total = 0.0
    for point, cluster in zip(X, l):
        total += np.linalg.norm(point - c[cluster])
    return total
# Report the clustering SSE for K = 1..5 to see how it shrinks as K grows.
for K in range(1, 6):
    labels, centers = kmeans(X, K)
    print(K, sse(X, labels, centers))
Exp 13: Demonstrate the use of Fuzzy C-Means Clustering.
# EXP 13: Fuzzy C-Means clustering on the Iris dataset.
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
# BUG FIX: every step below uses X as a (features, samples) matrix, but the
# original never transposed (despite its own comment saying "transpose"),
# so it clustered the 4 feature vectors instead of the 150 flowers.
X = data.data.T                                # 1. Data as 4 features x 150 samples
K, m, e, i = 3, 2, 1e-5, 10                    # 2. clusters, fuzziness, error, iterations
c = X[:, np.random.choice(X.shape[1], K, 0)]   # 3. K random sample columns as centers
for _ in range(i):                             # 4. Repeat update steps
    # 5. (samples, K) distance matrix; +1e-9 avoids division by zero.
    d = np.linalg.norm(X[:, :, None] - c[:, None], axis=0) + 1e-9
    u = 1 / d ** (2 / (m - 1))                 # 6. Unnormalized memberships
    u = u / u.sum(1, keepdims=1)               # 7. Normalize so each row sums to 1
    c = (X @ (u ** m)) / (u ** m).sum(0)       # 8. Membership-weighted center update
labels = u.argmax(1)                           # 9. Hard label = strongest membership
print(labels, sum(d[n, labels[n]] for n in range(len(labels))))  # 10. Labels + total distance
Exp 14: Demonstrate the use of Expectation Maximization based clustering algorithm.
# EXP 14: Expectation-Maximization (Gaussian mixture) clustering on Iris.
import numpy as np
from sklearn.datasets import load_iris

X = load_iris().data               # (150, 4): 150 flowers, 4 features
K = 3                              # number of clusters
n, d = X.shape                     # n samples, d features
p = np.ones((n, K)) / K            # responsibilities, start uniform
m = X[np.random.choice(n, K, 0)]   # K distinct random points as initial means
s = np.array([np.eye(d)] * K)      # initial covariances: identity (spherical)
# ROBUSTNESS FIX: a small ridge keeps each covariance invertible; the
# original could crash in np.linalg.inv once a cluster's covariance
# collapsed to singular during the iterations.
reg = 1e-6 * np.eye(d)
for _ in range(10):                # fixed number of EM rounds
    for k in range(K):             # ----- E STEP (Expectation) -----
        diff = X - m[k]            # deviation of every point from mean k
        # Unnormalized Gaussian density; the common (2*pi)^(d/2) factor is
        # omitted because it cancels in the row normalization below.
        p[:, k] = np.exp(-.5 * np.sum(diff @ np.linalg.inv(s[k] + reg) * diff, 1)) / np.sqrt(np.linalg.det(s[k] + reg))
    p /= p.sum(1, keepdims=True)   # normalize each row into probabilities
    for k in range(K):             # ----- M STEP (Maximization) -----
        w = p[:, k]                # how strongly each point belongs to cluster k
        m[k] = (w @ X) / w.sum()   # responsibility-weighted mean
        s[k] = ((X - m[k]).T * (w / w.sum())) @ (X - m[k])  # weighted covariance
print("Cluster labels:", p.argmax(1))  # hard label = highest responsibility

Comments
Post a Comment