ML PROGRAMS
ML PPT LINK
ML LAB PROGRAMS
EXP 1: To compute and understand the basic measures of Central Tendency (Mean, Median, Mode) and Dispersion (Variance, Standard Deviation) using Python.
Program:
# EXP 1: Compute basic measures of central tendency (mean, median, mode)
# and dispersion (sample variance, sample standard deviation).
import numpy as np       # numerical computing
from scipy import stats  # statistical functions (mode)

data = [85, 92, 78, 85, 95, 88, 72, 85, 90, 80]  # sample dataset
data_array = np.array(data)  # convert the list to a NumPy array

mean_value = np.mean(data_array)      # arithmetic average
median_value = np.median(data_array)  # middle value of the sorted data
# BUG FIX: on SciPy >= 1.11 stats.mode returns scalars by default, so the
# old `mode_result.mode[0]` indexing raises. keepdims=False (SciPy >= 1.9)
# makes the scalar behavior explicit and version-stable.
mode_result = stats.mode(data_array, keepdims=False)

print(f"Mean: {mean_value:.2f}")      # :.2f -> two decimal places
print(f"Median: {median_value:.2f}")
print(f"Mode: {mode_result.mode}")    # most frequent value

variance_value = np.var(data_array, ddof=1)  # sample variance (divides by n-1)
std_dev_value = np.std(data_array, ddof=1)   # sample standard deviation
print(f"Variance: {variance_value:.2f}")
print(f"Standard Deviation: {std_dev_value:.2f}")
EXP 2: To apply fundamental data preprocessing techniques — Attribute Selection, Handling Missing Values, Discretization, and Elimination of Outliers — using Python libraries.
# EXP 2: Data preprocessing — attribute selection, missing-value handling,
# discretization, and IQR-based outlier elimination.
import pandas as pd                                 # tabular data (DataFrames)
import numpy as np                                  # numeric ops / np.nan
from sklearn.preprocessing import KBinsDiscretizer  # continuous -> binned values

# Build the raw dataset as a DataFrame.
records = {
    'Age': [25, 30, 35, 40, 22, 60],
    'Salary': [50000, 60000, np.nan, 120000, 30000, 1000000],
    'Experience': [2, 5, 8, 15, 1, 40],
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
}
df = pd.DataFrame(records)
print("Original Dataset:\n", df)

# Attribute selection: keep only the numeric columns.
# .copy() keeps the original DataFrame untouched.
numeric_cols = ['Age', 'Salary', 'Experience']
df_selected = df[numeric_cols].copy()

# Missing values: mean-impute the one missing Salary.
salary_mean = df_selected['Salary'].mean()
df_selected['Salary'] = df_selected['Salary'].fillna(salary_mean)

# Discretization: Age into 3 equal-width bins (0 young, 1 middle, 2 older).
binner = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
df_selected['Age_Bin'] = binner.fit_transform(df_selected[['Age']])

# Outlier elimination: keep salaries inside the 1.5*IQR fences.
q1, q3 = df_selected['Salary'].quantile([0.25, 0.75])
iqr = q3 - q1
lo = q1 - 1.5 * iqr
hi = q3 + 1.5 * iqr
in_range = df_selected['Salary'].between(lo, hi)  # inclusive on both ends
df_no_outliers = df_selected[in_range]
print("\nAfter Eliminating Outliers:\n", df_no_outliers)
EXP 3: To apply the K-Nearest Neighbors (KNN) algorithm for both classification and regression tasks using Python.
# EXP 3 (regression): K-Nearest Neighbors regression on synthetic data.
import numpy as np                # numerical operations
import matplotlib.pyplot as plt   # scatter-plot visualization
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Generate a 1-feature synthetic regression dataset.
X, y = make_regression(n_samples=200, n_features=1, noise=0.1, random_state=42)

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the 5-neighbor KNN regressor.
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)

# Make predictions on the test data.
y_pred = knn_regressor.predict(X_test)

# Evaluate the model with MSE and R^2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the results.
# BUG FIX: the original line ended with a typographic quote (label='Actual’),
# which is a SyntaxError in Python; replaced with a straight quote.
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.scatter(X_test, y_pred, color='red', label='Predicted')
plt.title('KNN Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()
# EXP 3 (classification): K-Nearest Neighbors on the Iris dataset.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the dataset.
# BUG FIX: in the original, `iris = load_iris()` had been merged into the
# comment line, so `iris` was never defined (NameError on the next line).
iris = load_iris()
X = iris.data    # Features (4 columns: sepal/petal length/width)
y = iris.target  # Target labels (0: Setosa, 1: Versicolor, 2: Virginica)

# Step 2: Split the dataset into training and testing sets.
# 80% training (120 samples), 20% testing (30 samples).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create and train the KNN classifier (5 nearest neighbors).
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Step 4: Make predictions on the test data.
y_pred = knn_classifier.predict(X_test)

# Step 5: Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Step 6: Predict a new input sample.
# Order: [sepal length, sepal width, petal length, petal width].
input_sample = np.array([[5.1, 3.5, 1.4, 0.2]])  # Change values as needed
predicted_class = knn_classifier.predict(input_sample)       # class index
predicted_prob = knn_classifier.predict_proba(input_sample)  # class probabilities (optional)
print("\nInput Sample:", input_sample)
print("Predicted Flower Name:", iris.target_names[predicted_class[0]])
Exp 4: Demonstrate decision tree algorithm for a classification problem and perform parameter tuning for better results
# EXP 4: Decision-tree classification with grid-search parameter tuning.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the Iris data into pandas structures.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline: an untuned decision tree.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Default Accuracy:", accuracy_score(y_test, y_pred))

# Hyper-parameter space explored by the grid search.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
}
grid = GridSearchCV(DecisionTreeClassifier(), params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Evaluate the best model found by the 5-fold search.
best_model = grid.best_estimator_
y_best = best_model.predict(X_test)
print("Best Params:", grid.best_params_)
print("Tuned Accuracy:", accuracy_score(y_test, y_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_best))
print("\nClassification Report:\n", classification_report(y_test, y_best))
Exp 6a: Apply Random Forest algorithm for classification
# EXP 6a: Random Forest classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A 100-tree forest using Gini impurity for splits.
rf_classifier = RandomForestClassifier(n_estimators=100, criterion="gini", random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict and report the standard classification metrics.
y_pred = rf_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Exp 6b: Apply Random Forest algorithm for regression
# EXP 6b: Random Forest regression on a noisy quadratic curve.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Synthetic data: y = 3x^2 + 5x + 10 plus Gaussian noise (fixed seed).
np.random.seed(42)
X = np.linspace(0, 10, 200).reshape(-1, 1)
x_flat = X.squeeze()
y = 3 * x_flat ** 2 + 5 * x_flat + 10 + np.random.randn(200) * 10

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 200 depth-limited trees.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict on the held-out points and report error metrics.
y_pred = rf_model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# Sort the test points so the prediction line plots smoothly.
sorted_idx = X_test.squeeze().argsort()
plt.figure()
plt.scatter(X_test, y_test)
plt.plot(X_test[sorted_idx], y_pred[sorted_idx])
plt.xlabel("Input Feature (X)")
plt.ylabel("Target Value (y)")
plt.title("Random Forest Regression")
plt.show()
Exp 7: Demonstrate Naïve Bayes Classification algorithm.
# EXP 7: Gaussian Naive Bayes classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and target labels.
data = load_iris()
X, y = data.data, data.target

# 70% training / 30% testing split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the Gaussian Naive Bayes model.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Predict on the held-out set.
y_pred = nb_classifier.predict(X_test)

# Accuracy plus per-class report and confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Exp 8: Apply Support Vector algorithm for classification.
# EXP 8: Support Vector Machine classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and target labels.
data = load_iris()
X, y = data.data, data.target

# 70% training / 30% testing split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear-kernel SVM for simplicity ('rbf', 'poly', 'sigmoid' are alternatives).
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Predict on the held-out set.
y_pred = svm_classifier.predict(X_test)

# Accuracy plus per-class report and confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Exp 9: Demonstrate simple linear regression algorithm for a regression problem.
# EXP 9: Simple linear regression — fit a line and plot it over the data.
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Toy dataset: one feature, one target.
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

# sklearn expects a 2-D feature matrix, hence the reshape to a column.
x_col = x.reshape(-1, 1)
lreg = LinearRegression()
lreg.fit(x_col, y)

# Scatter the raw points and overlay the fitted regression line.
plt.scatter(x, y)
plt.plot(x, lreg.predict(x_col), color='red')
plt.title("Linear Regression")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
Exp 10: Apply Logistic regression algorithm for a classification problem.
# EXP 10: Logistic regression on a 1-D toy dataset.
import numpy
from sklearn import linear_model

# Feature values, reshaped to a column vector because sklearn expects a
# 2-D feature matrix.
X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69,
                 5.88]).reshape(-1, 1)
print(X)
# Binary class labels for each feature value.
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# Fit the logistic regression model.
logr = linear_model.LogisticRegression()
logr.fit(X, y)

# Predict the class for a new value.
predicted = logr.predict(numpy.array([3.46]).reshape(-1, 1))
print(predicted)

# FIX: the original created and fit a second, identical model here for no
# reason; the already-fitted model's coefficients are used directly.
# coef_ holds the log-odds; exponentiating gives the odds ratio per unit of X.
log_odds = logr.coef_
odds = numpy.exp(log_odds)
print(odds)
Exp 11: Demonstrate Multi-layer Perceptron algorithm for a classification problem.
# EXP 11: Multi-layer Perceptron classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Load the Iris dataset.
data = load_iris()
X = data.data                     # Features
y = data.target                   # Target labels
feature_names = data.feature_names
class_names = data.target_names

# Stratified 80/20 split keeps the class proportions equal in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features (zero mean, unit variance); the scaler is fit on
# the training set only so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# MLP: two hidden layers (100, 50), ReLU activation, Adam optimizer,
# early stopping on a 10% validation split.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,              # L2 penalty (regularization) strength
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=500,
    random_state=42,
    early_stopping=True,       # stop when validation score plateaus
    validation_fraction=0.1
)

# Train the model.
mlp.fit(X_train, y_train)

# Make predictions (hard labels and class probabilities).
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)

# Evaluate the model.
print(f"Training set score: {mlp.score(X_train, y_train):.3f}")
print(f"Test set score: {mlp.score(X_test, y_test):.3f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Plot confusion matrix.
# FIX: ConfusionMatrixDisplay and matplotlib were imported but never used,
# and the script ended at the bare "Plot confusion matrix" comment; this
# completes the promised plot.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=class_names)
plt.title("MLP Confusion Matrix")
plt.show()
Exp 12: Implement the K-means algorithm and apply it to the data you selected. Evaluate performance by measuring the sum of the Euclidean distances of each example from its cluster center. Test the performance of the algorithm as a function of the parameter K.
import numpy as np  # array math used by the K-means implementation below
from sklearn.datasets import load_iris  # source of the demo dataset
X = load_iris().data # selected data: 150 iris samples x 4 numeric features
def kmeans(X, K, i=10):
    """Lloyd's K-means clustering.

    X : (n, d) array of samples; K : number of clusters; i : iterations.
    Returns (labels, centers) where labels is (n,) and centers is (K, d).
    """
    # Initialize centers by sampling K distinct points (replace=False).
    c = X[np.random.choice(len(X), K, 0)]
    for _ in range(i):
        # Assign each point to its nearest center (squared Euclidean).
        l = ((X[:, None] - c) ** 2).sum(2).argmin(1)
        # ROBUSTNESS FIX: an empty cluster keeps its previous center; the
        # original took the mean of an empty slice, producing NaN centers.
        c = np.array([X[l == k].mean(0) if np.any(l == k) else c[k]
                      for k in range(K)])
    return l, c
def sse(X, l, c):
    """Sum of Euclidean distances from each sample to its assigned center."""
    total = 0.0
    for point, cluster in zip(X, l):
        total += np.linalg.norm(point - c[cluster])
    return total
# Report the clustering SSE for K = 1..5 to see how it shrinks as K grows.
for K in range(1, 6):
    labels, centers = kmeans(X, K)
    print(K, sse(X, labels, centers))
Exp 13: Demonstrate the use of Fuzzy C-Means Clustering.
# EXP 13: Fuzzy C-Means clustering on the Iris dataset.
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
# BUG FIX: every step below uses X as a (features, samples) matrix, but the
# original never transposed (despite its own comment saying "transpose"),
# so it clustered the 4 feature vectors instead of the 150 flowers.
X = data.data.T                                # 1. Data as 4 features x 150 samples
K, m, e, i = 3, 2, 1e-5, 10                    # 2. clusters, fuzziness, error, iterations
c = X[:, np.random.choice(X.shape[1], K, 0)]   # 3. K random sample columns as centers
for _ in range(i):                             # 4. Repeat update steps
    # 5. (samples, K) distance matrix; +1e-9 avoids division by zero.
    d = np.linalg.norm(X[:, :, None] - c[:, None], axis=0) + 1e-9
    u = 1 / d ** (2 / (m - 1))                 # 6. Unnormalized memberships
    u = u / u.sum(1, keepdims=1)               # 7. Normalize so each row sums to 1
    c = (X @ (u ** m)) / (u ** m).sum(0)       # 8. Membership-weighted center update
labels = u.argmax(1)                           # 9. Hard label = strongest membership
print(labels, sum(d[n, labels[n]] for n in range(len(labels))))  # 10. Labels + total distance
Exp 14: Demonstrate the use of Expectation Maximization based clustering algorithm.
# EXP 14: Expectation-Maximization (Gaussian mixture) clustering on Iris.
import numpy as np
from sklearn.datasets import load_iris

X = load_iris().data               # (150, 4): 150 flowers, 4 features
K = 3                              # number of clusters
n, d = X.shape                     # n samples, d features
p = np.ones((n, K)) / K            # responsibilities, start uniform
m = X[np.random.choice(n, K, 0)]   # K distinct random points as initial means
s = np.array([np.eye(d)] * K)      # initial covariances: identity (spherical)
# ROBUSTNESS FIX: a small ridge keeps each covariance invertible; the
# original could crash in np.linalg.inv once a cluster's covariance
# collapsed to singular during the iterations.
reg = 1e-6 * np.eye(d)
for _ in range(10):                # fixed number of EM rounds
    for k in range(K):             # ----- E STEP (Expectation) -----
        diff = X - m[k]            # deviation of every point from mean k
        # Unnormalized Gaussian density; the common (2*pi)^(d/2) factor is
        # omitted because it cancels in the row normalization below.
        p[:, k] = np.exp(-.5 * np.sum(diff @ np.linalg.inv(s[k] + reg) * diff, 1)) / np.sqrt(np.linalg.det(s[k] + reg))
    p /= p.sum(1, keepdims=True)   # normalize each row into probabilities
    for k in range(K):             # ----- M STEP (Maximization) -----
        w = p[:, k]                # how strongly each point belongs to cluster k
        m[k] = (w @ X) / w.sum()   # responsibility-weighted mean
        s[k] = ((X - m[k]).T * (w / w.sum())) @ (X - m[k])  # weighted covariance
print("Cluster labels:", p.argmax(1))  # hard label = highest responsibility

Comments
Post a Comment