ML PROGRAMS

ML PPT LINK
 ML LAB PROGRAMS


EXP1: To compute and understand the basic measures of Central Tendency (Mean, Median, Mode) and Dispersion (Variance, Standard Deviation) using Python

Program:

import numpy as np       # numerical computing (arrays, vectorized statistics)

from scipy import stats  # statistical functions (mode)

# Dataset: ten exam scores.
data = [85, 92, 78, 85, 95, 88, 72, 85, 90, 80]

data_array = np.array(data)  # convert list to NumPy array for vectorized ops

mean_value = np.mean(data_array)      # arithmetic average
median_value = np.median(data_array)  # middle value of the sorted data

# Most frequent value. Since SciPy 1.11 the default keepdims=False returns
# scalars, so we read .mode directly; the old `mode_result.mode[0]` raises.
mode_result = stats.mode(data_array, keepdims=False)

print(f"Mean: {mean_value:.2f}")      # :.2f -> two decimal places
print(f"Median: {median_value:.2f}")
print(f"Mode: {mode_result.mode}")

variance_value = np.var(data_array, ddof=1)  # sample variance (divide by n-1)
std_dev_value = np.std(data_array, ddof=1)   # sample standard deviation

print(f"Variance: {variance_value:.2f}")
print(f"Standard Deviation: {std_dev_value:.2f}")

EXP 2: To apply fundamental data preprocessing techniques — Attribute Selection, Handling Missing Values, Discretization, and Elimination of Outliers — using Python libraries.

import pandas as pd  # tabular data handling (DataFrame)

import numpy as np   # numerical helpers (np.nan marks missing values)

from sklearn.preprocessing import KBinsDiscretizer  # continuous values -> bins

# Build the raw dataset directly as a DataFrame.
df = pd.DataFrame({
    'Age': [25, 30, 35, 40, 22, 60],
    'Salary': [50000, 60000, np.nan, 120000, 30000, 1000000],
    'Experience': [2, 5, 8, 15, 1, 40],
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
})
print("Original Dataset:\n", df)

# Attribute selection: keep the numeric columns only.
# .copy() so later edits do not touch the original frame.
df_selected = df[['Age', 'Salary', 'Experience']].copy()

# Handling missing values: replace the NaN salary with the column mean.
df_selected['Salary'] = df_selected['Salary'].fillna(df_selected['Salary'].mean())

# Discretization: split Age into 3 equal-width bins coded 0 / 1 / 2
# (roughly young / middle-aged / older).
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
df_selected['Age_Bin'] = discretizer.fit_transform(df_selected[['Age']])

# Outlier elimination on Salary via the 1.5*IQR rule.
Q1 = df_selected['Salary'].quantile(0.25)
Q3 = df_selected['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Series.between is inclusive on both ends, matching >= lower and <= upper.
in_range = df_selected['Salary'].between(lower_bound, upper_bound)
df_no_outliers = df_selected[in_range]

print("\nAfter Eliminating Outliers:\n", df_no_outliers)

EXP 3: To apply the K-Nearest Neighbors (KNN) algorithm for both classification and regression tasks using Python.

import numpy as np               # numerical operations
import matplotlib.pyplot as plt  # scatter plot of actual vs predicted values
from sklearn.datasets import make_regression  # synthetic regression data
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# NOTE: the original text had stray leading spaces (IndentationError) and a
# curly quote in the 'Actual' label (SyntaxError); both are fixed here.

# Generate a synthetic single-feature regression dataset (fixed seed).
X, y = make_regression(n_samples=200, n_features=1, noise=0.1, random_state=42)

# Split the dataset: 80% training / 20% testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the KNN regressor (prediction = mean of 5 nearest neighbors).
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)

# Make predictions on the held-out test data.
y_pred = knn_regressor.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the results.
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.scatter(X_test, y_pred, color='red', label='Predicted')
plt.title('KNN Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the dataset.
# (In the original text the assignment had been swallowed into the comment,
# leaving `iris` undefined.)
iris = load_iris()
X = iris.data    # Features (4 columns: sepal/petal length/width)
y = iris.target  # Target labels (0: Setosa, 1: Versicolor, 2: Virginica)

# Step 2: Split into 80% training (120 samples) / 20% testing (30 samples).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create and train the KNN classifier with 5 nearest neighbors.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Step 4: Make predictions on the test data.
y_pred = knn_classifier.predict(X_test)

# Step 5: Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Step 6: Predict a new input sample.
# Format: [sepal length, sepal width, petal length, petal width]
input_sample = np.array([[5.1, 3.5, 1.4, 0.2]])  # Change values as needed

predicted_class = knn_classifier.predict(input_sample)        # hard class label
predicted_prob = knn_classifier.predict_proba(input_sample)   # class probabilities (optional)

print("\nInput Sample:", input_sample)
print("Predicted Flower Name:", iris.target_names[predicted_class[0]])

Exp 4: Demonstrate decision tree algorithm for a classification problem and perform parameter tuning for better results

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset into a labelled DataFrame / Series.
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Split data: 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline decision tree. random_state=42 makes the tree's internal feature
# permutation (tie-breaking) reproducible, consistent with the rest of the file.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Default Accuracy:", accuracy_score(y_test, y_pred))

# Parameter tuning via 5-fold cross-validated grid search.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Evaluate the best estimator found by the search.
best_model = grid.best_estimator_
y_best = best_model.predict(X_test)
print("Best Params:", grid.best_params_)
print("Tuned Accuracy:", accuracy_score(y_test, y_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_best))
print("\nClassification Report:\n", classification_report(y_test, y_best))

Exp 6a:  Apply Random Forest algorithm for classification 

# Random Forest classification on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load features and labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit a 100-tree forest using Gini-impurity splits.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

# Predict on the held-out samples and report standard metrics.
y_pred = rf_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Exp 6b:  Apply Random Forest algorithm for regression 

# Random Forest regression on a noisy quadratic curve.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Synthetic data: y = 3x^2 + 5x + 10 plus Gaussian noise, fixed seed.
np.random.seed(42)
X = np.linspace(0, 10, 200).reshape(-1, 1)
y = 3 * X.squeeze() ** 2 + 5 * X.squeeze() + 10 + np.random.randn(200) * 10

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 200 depth-limited trees.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Score the model on the held-out data.
y_pred = rf_model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# Plot raw test points plus the prediction curve; sort by X so the
# predicted line is drawn smoothly left to right.
order = X_test.squeeze().argsort()
plt.figure()
plt.scatter(X_test, y_test)
plt.plot(X_test[order], y_pred[order])
plt.xlabel("Input Feature (X)")
plt.ylabel("Target Value (y)")
plt.title("Random Forest Regression")
plt.show()

Exp 7:  Demonstrate Naïve Bayes Classification algorithm. 

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Iris dataset
data = load_iris()
X = data.data    # Features
y = data.target  # Target labels

# Split the data into training and testing sets (70% train, 30% test).
# (The original call was missing its closing parenthesis.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Gaussian Naive Bayes classifier
# (Gaussian variant: assumes each feature is normally distributed per class).
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model (several print calls below also lacked closing parens)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Exp 8: Apply Support Vector algorithm for classification. 

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Iris dataset
data = load_iris()
X = data.data    # Features
y = data.target  # Target labels

# Split the data into training and testing sets (70% train, 30% test).
# (The original call was missing its closing parenthesis.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the SVM classifier.
# Using the 'linear' kernel for simplicity (try 'rbf', 'poly', 'sigmoid' too).
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the model
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

# Evaluate the model (several print calls below also lacked closing parens)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Exp 9: Demonstrate simple linear regression algorithm for a regression problem. 

from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])    # predictor values
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])  # observed responses

# scikit-learn expects a 2-D feature matrix; reshape once and reuse it.
X = x.reshape(-1, 1)

lreg = LinearRegression()
lreg.fit(X, y)

# Scatter of the data plus the fitted regression line.
# (The original plot/title/label calls were missing closing parentheses.)
plt.scatter(x, y)
plt.plot(x, lreg.predict(X), color='red')
plt.title("Linear Regression")
plt.xlabel("x")
plt.ylabel("y")
plt.show()

Exp 10: Apply Logistic regression algorithm for a classification problem. 

import numpy

# Feature values, reshaped to a column vector because scikit-learn
# expects a 2-D X. (The original literal was split mid-list with a
# missing comma and closing parenthesis.)
X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92,
                 4.37, 4.96, 4.52, 3.69, 5.88]).reshape(-1, 1)
print(X)

y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])  # binary class labels

from sklearn import linear_model

logr = linear_model.LogisticRegression()
logr.fit(X, y)

# Predict the class of a new observation.
predicted = logr.predict(numpy.array([3.46]).reshape(-1, 1))
print(predicted)

# Odds ratio: exponentiate the learned log-odds coefficient.
# (The original re-created and re-fitted an identical model here; fitting
# once is sufficient since nothing changed in between.)
log_odds = logr.coef_
odds = numpy.exp(log_odds)
print(odds)

Exp 11: Demonstrate Multi-layer Perceptron algorithm for a classification problem. 

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Load the Iris dataset
data = load_iris()
X = data.data    # Features
y = data.target  # Target labels
feature_names = data.feature_names
class_names = data.target_names

# Split the data (80% train, 20% test), stratified so class proportions match.
# (The original call was missing its closing parenthesis.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features to zero mean / unit variance; the scaler is fitted on
# the training split only to avoid leaking test statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLP classifier.
# (The original constructor call was also missing its closing parenthesis.)
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # two hidden layers with 100 and 50 neurons
    activation='relu',             # rectified linear unit activation
    solver='adam',                 # optimization algorithm
    alpha=0.0001,                  # L2 penalty (regularization) strength
    batch_size='auto',             # size of minibatches
    learning_rate='constant',      # learning-rate schedule
    learning_rate_init=0.001,      # initial learning rate
    max_iter=500,                  # maximum number of iterations
    random_state=42,               # random seed
    early_stopping=True,           # stop when validation score stops improving
    validation_fraction=0.1,       # fraction of training data used as validation
)

# Train the model
mlp.fit(X_train, y_train)

# Make predictions
y_pred = mlp.predict(X_test)
y_pred_prob = mlp.predict_proba(X_test)  # class-membership probabilities

# Evaluate the model
print(f"Training set score: {mlp.score(X_train, y_train):.3f}")
print(f"Test set score: {mlp.score(X_test, y_test):.3f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Plot confusion matrix (uses the ConfusionMatrixDisplay already imported above)
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names).plot()
plt.show()

Exp 12: Implement the K-means algorithm and apply it to the data you selected. Evaluate performance by measuring the sum of the Euclidean distances of each example from its class center. Test the performance of the algorithm as a function of the parameter K.

import numpy as np

from sklearn.datasets import load_iris


X = load_iris().data  # selected data: the iris feature matrix, shape (150, 4)


def kmeans(X, K, i=10):
    """Plain Lloyd's K-means.

    Parameters
    ----------
    X : (n_samples, n_features) array of points to cluster.
    K : number of clusters.
    i : number of update iterations (default 10).

    Returns
    -------
    labels : (n_samples,) array, index of the nearest centre per point.
    centres : (K, n_features) array of final cluster centres.

    Note: centres are initialised from np.random, so seed the global
    RNG for reproducible runs.
    """
    # Initialise centres as K distinct points sampled from X.
    centres = X[np.random.choice(len(X), K, replace=False)]
    for _ in range(i):
        # Assign each point to its nearest centre (squared Euclidean distance).
        labels = ((X[:, None] - centres) ** 2).sum(axis=2).argmin(axis=1)
        # Recompute each centre as the mean of its members.  If a cluster
        # loses all its points, keep the previous centre instead of taking
        # the mean of an empty slice (which produced NaN in the original).
        centres = np.array([
            X[labels == k].mean(axis=0) if np.any(labels == k) else centres[k]
            for k in range(K)
        ])
    return labels, centres


def sse(X, l, c):
    """Performance measure: the total Euclidean distance of each
    sample in X to the centre c[l[i]] of its assigned cluster."""
    distances = [np.linalg.norm(sample - c[label]) for sample, label in zip(X, l)]
    return sum(distances)


# Test performance as a function of K: run K-means for K = 1..5 and
# print the total distance-to-centre achieved at each K.
for K in range(1, 6):
    labels, centres = kmeans(X, K)
    print(K, sse(X, labels, centres))

Exp 13: Demonstrate the use of Fuzzy C-Means Clustering. 

import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
# Work with features in rows and samples in columns: shape (4, 150).
# (The original comment promised this transpose but never applied it, so
# centres were drawn from the feature axis and every later matrix shape
# in the update loop was inconsistent.)
X = data.data.T

K, m, e, i = 3, 2, 1e-5, 10  # clusters, fuzziness, error tolerance, iterations

# Randomly pick K sample columns as the initial centres, shape (4, K).
c = X[:, np.random.choice(X.shape[1], K, replace=False)]

for _ in range(i):  # repeat the membership/centre update steps
    # Distance of every sample to every centre -> (n_samples, K);
    # the epsilon avoids division by zero for a point sitting on a centre.
    d = np.linalg.norm(X[:, :, None] - c[:, None], axis=0) + 1e-9
    u = 1 / d ** (2 / (m - 1))          # unnormalized fuzzy memberships
    u = u / u.sum(1, keepdims=True)     # normalize: each row sums to 1
    c = (X @ (u ** m)) / (u ** m).sum(0)  # weighted-mean update of centres

labels = u.argmax(1)  # hard label = cluster with the highest membership
# Report labels plus total distance of each sample to its winning centre.
print(labels, sum(d[n, labels[n]] for n in range(len(labels))))

Exp 14: Demonstrate the use of Expectation Maximization based clustering algorithm. 

import numpy as np
from sklearn.datasets import load_iris

# Expectation-Maximization clustering: a stripped-down Gaussian mixture.
# There are no mixing weights and the densities drop their (2*pi)^(d/2)
# constant, which is harmless here because responsibilities are
# renormalized row-wise after the E step.

X = load_iris().data        # Load dataset (150 flowers, 4 features)
K = 3                       # We want 3 clusters (groups)
n, d = X.shape              # n=samples, d=features
p = np.ones((n, K))/K       # Start: each point has equal chance in each cluster
m = X[np.random.choice(n, K, 0)]  # Pick random points as initial cluster centers (RNG unseeded, so runs differ)
s = np.array([np.eye(d)]*K) # Start: clusters are assumed circular (identity matrix)

for _ in range(10):         # Repeat 10 times to improve clusters
    for k in range(K):      # ----- E STEP (Expectation) -----
        diff = X - m[k]     # Distance of all points from cluster k mean
        # Unnormalized Gaussian density: exp(-0.5 * Mahalanobis^2) / sqrt(det(cov))
        p[:,k] = np.exp(-.5*np.sum(diff@np.linalg.inv(s[k])*diff,1))/np.sqrt(np.linalg.det(s[k]))
    p /= p.sum(1, keepdims=True)  # Convert to real probability (sum of each row = 1)

    for k in range(K):      # ----- M STEP (Maximization) -----
        w = p[:,k]          # Weight = how much each point belongs to cluster k
        m[k] = (w@X)/w.sum()      # Update cluster center (weighted mean)
        s[k] = ((X-m[k]).T*(w/w.sum()))@(X-m[k])  # Update cluster shape/spread (weighted covariance)
    # NOTE(review): if a cluster's covariance collapses, inv/det above become
    # unstable; acceptable for this demo on the iris data.

print("Cluster labels:", p.argmax(1))  # Final group = cluster with highest probability

Comments

Popular Posts