Compare commits
16 Commits
0d69f340ae
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 9478b2427e | |||
| d616ec2168 | |||
| 637aa2e380 | |||
| dd3bd507b1 | |||
| b3d9c2e76e | |||
| f62842d23e | |||
| 1a7deafab7 | |||
| b930018981 | |||
| 9dc29f19e4 | |||
| b2c0110d11 | |||
| 4b56e164a1 | |||
| 2174ab1fb0 | |||
| ba3d03b186 | |||
| e1a8d38cee | |||
| 3162934b99 | |||
| 6a25385409 |
1905
Skin_Cancer_Classification.ipynb
Normal file
1905
Skin_Cancer_Classification.ipynb
Normal file
File diff suppressed because one or more lines are too long
6
run.sh
6
run.sh
@@ -16,8 +16,10 @@ source "$VENV_DIR/bin/activate"
|
|||||||
# 3. Install dependencies (lightweight, safe to re-run)
|
# 3. Install dependencies (lightweight, safe to re-run)
|
||||||
echo "Installing dependencies..."
|
echo "Installing dependencies..."
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
|
pip install pandas numpy matplotlib seaborn pillow scikit-learn
|
||||||
|
pip install "tensorflow[and-cuda]"
|
||||||
pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
|
pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
|
||||||
|
|
||||||
python3 skin_cancer_classification.py
|
jupyter nbconvert --to notebook --execute Skin_Cancer_Classification.ipynb --output Skin_Cancer_Classification_Final.ipynb --log-level=DEBUG
|
||||||
|
# python3 skin_cancer_classification.py
|
||||||
echo "Done."
|
echo "Done."
|
||||||
|
|||||||
@@ -1,412 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""Skin Cancer Classification.ipynb
|
|
||||||
|
|
||||||
Automatically generated by Colab.
|
|
||||||
|
|
||||||
Original file is located at
|
|
||||||
https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
|
|
||||||
|
|
||||||
# Skin Cancer Classification
|
|
||||||
**Output**: Benign / Malignant
|
|
||||||
|
|
||||||
**Model**: DenseNet121 (transfer learning)
|
|
||||||
|
|
||||||
**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
|
|
||||||
|
|
||||||
| Class | Name | Value |
|
|
||||||
|-------|-------------------------------------|-----------|
|
|
||||||
| nv | Melanocytic nevi (moles) | Benign |
|
|
||||||
| bcc | Basal cell carcinoma | Malignant |
|
|
||||||
| bkl | Benign keratosis-like lesions | Benign |
|
|
||||||
| df | Dermatofibroma | Benign |
|
|
||||||
| vasc | Vascular lesions | Benign |
|
|
||||||
| mel | Melanoma | Malignant |
|
|
||||||
| akiec | Actinic keratoses / Bowen's disease | Benign |
|
|
||||||
|
|
||||||
## 1. Requirements and dataset download
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
IN_COLAB = 'google.colab' in sys.modules
|
|
||||||
|
|
||||||
# if IN_COLAB:
|
|
||||||
# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
|
|
||||||
# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
|
|
||||||
|
|
||||||
import kagglehub
|
|
||||||
|
|
||||||
# Download latest version
|
|
||||||
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
|
|
||||||
|
|
||||||
print("Path to dataset files:", path)
|
|
||||||
|
|
||||||
# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
|
|
||||||
|
|
||||||
"""## 2. Imports and setup"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import glob
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import seaborn as sns
|
|
||||||
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
# Source - https://stackoverflow.com/a/53586419
|
|
||||||
# Posted by korakot, modified by community. See post 'Timeline' for change history
|
|
||||||
# Retrieved 2026-05-21, License - CC BY-SA 4.0
|
|
||||||
|
|
||||||
"""### Loading dataset"""
|
|
||||||
|
|
||||||
# Path to your dataset folder
|
|
||||||
dataset_path = path
|
|
||||||
|
|
||||||
# Metadata file
|
|
||||||
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
|
|
||||||
|
|
||||||
# Load CSV
|
|
||||||
df = pd.read_csv(metadata_path)
|
|
||||||
|
|
||||||
# Show first rows
|
|
||||||
print(df.head())
|
|
||||||
|
|
||||||
# Collect all image paths
|
|
||||||
image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)
|
|
||||||
|
|
||||||
# Create dictionary:
|
|
||||||
# key = image_id
|
|
||||||
# value = full image path
|
|
||||||
imageid_path_dict = {
|
|
||||||
os.path.splitext(os.path.basename(x))[0]: x
|
|
||||||
for x in image_paths
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add image path column
|
|
||||||
df['path'] = df['image_id'].map(imageid_path_dict.get)
|
|
||||||
|
|
||||||
# Check
|
|
||||||
print(df[['image_id', 'path']].head())
|
|
||||||
|
|
||||||
"""## 3. Dataset analysis
|
|
||||||
|
|
||||||
### Missingness
|
|
||||||
"""
|
|
||||||
|
|
||||||
missing = df['path'].isnull().sum()
|
|
||||||
|
|
||||||
print(f"Missing images: {missing}")
|
|
||||||
|
|
||||||
"""### Class distribution"""
|
|
||||||
|
|
||||||
plt.figure(figsize=(10,5))
|
|
||||||
|
|
||||||
sns.countplot(data=df, x='dx', order=df['dx'].value_counts().index)
|
|
||||||
|
|
||||||
plt.title("Class Distribution")
|
|
||||||
plt.xlabel("Diagnosis")
|
|
||||||
plt.ylabel("Count")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
"""### Visualize samples"""
|
|
||||||
|
|
||||||
fig, axes = plt.subplots(2, 4, figsize=(10,5))
|
|
||||||
|
|
||||||
for i, ax in enumerate(axes.flat):
|
|
||||||
sample = df.sample(1).iloc[0]
|
|
||||||
|
|
||||||
img = Image.open(sample['path'])
|
|
||||||
|
|
||||||
ax.imshow(img)
|
|
||||||
ax.set_title(sample['dx'])
|
|
||||||
ax.axis('off')
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
"""### Data distribution"""
|
|
||||||
|
|
||||||
plt.figure(figsize=(8,5))
|
|
||||||
|
|
||||||
sns.histplot(df['age'].dropna(), bins=20)
|
|
||||||
|
|
||||||
plt.title("Age Distribution")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
sns.countplot(data=df, x='sex')
|
|
||||||
|
|
||||||
plt.title("Sex Distribution")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
plt.figure(figsize=(12,5))
|
|
||||||
|
|
||||||
sns.countplot(
|
|
||||||
data=df,
|
|
||||||
x='localization',
|
|
||||||
order=df['localization'].value_counts().index
|
|
||||||
)
|
|
||||||
|
|
||||||
plt.xticks(rotation=45)
|
|
||||||
plt.title("Lesion Localization")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
"""### Image sizes"""
|
|
||||||
|
|
||||||
sizes = []
|
|
||||||
|
|
||||||
for path in df['path'].sample(100):
|
|
||||||
img = Image.open(path)
|
|
||||||
sizes.append(img.size)
|
|
||||||
|
|
||||||
print(pd.Series(sizes).value_counts())
|
|
||||||
|
|
||||||
"""## 4. Prepare dataset
|
|
||||||
|
|
||||||
### Encoding Binary Labels
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Mapping from dx to benign/malignant
|
|
||||||
# 0 --> benign
|
|
||||||
# 1 --> malignant
|
|
||||||
benign_malignant_dict = {
|
|
||||||
'nv': 0,
|
|
||||||
'bcc': 1,
|
|
||||||
'bkl': 0,
|
|
||||||
'df': 0,
|
|
||||||
'vasc': 0,
|
|
||||||
'mel': 1,
|
|
||||||
'akiec': 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create new column
|
|
||||||
df['target'] = df['dx'].map(benign_malignant_dict)
|
|
||||||
|
|
||||||
# Preview
|
|
||||||
print(df[['dx', 'target']].head())
|
|
||||||
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
|
|
||||||
train_df, val_df = train_test_split(
|
|
||||||
df,
|
|
||||||
test_size=0.2,
|
|
||||||
stratify=df['target'],
|
|
||||||
random_state=42
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Train size:", len(train_df))
|
|
||||||
print("Validation size:", len(val_df))
|
|
||||||
|
|
||||||
# Convert 'target' column to string type for ImageDataGenerator
|
|
||||||
train_df['target'] = train_df['target'].astype(str)
|
|
||||||
val_df['target'] = val_df['target'].astype(str)
|
|
||||||
|
|
||||||
print("Train target dtype after conversion:", train_df['target'].dtype)
|
|
||||||
print("Validation target dtype after conversion:", val_df['target'].dtype)
|
|
||||||
|
|
||||||
plt.figure(figsize=(6,3))
|
|
||||||
|
|
||||||
sns.countplot(data=df, x='target')
|
|
||||||
|
|
||||||
plt.title("Benign vs Malignant Distribution")
|
|
||||||
plt.xlabel("Lesion Type")
|
|
||||||
plt.ylabel("Count")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
print(df['target'].value_counts())
|
|
||||||
|
|
||||||
"""### Train / Validation split
|
|
||||||
|
|
||||||
### Class weight (class imbalance)
|
|
||||||
"""
|
|
||||||
|
|
||||||
from sklearn.utils.class_weight import compute_class_weight
|
|
||||||
|
|
||||||
classes = np.unique(train_df['target'])
|
|
||||||
|
|
||||||
weights = compute_class_weight(
|
|
||||||
class_weight='balanced',
|
|
||||||
classes=classes,
|
|
||||||
y=train_df['target']
|
|
||||||
)
|
|
||||||
|
|
||||||
class_weights = dict(enumerate(weights))
|
|
||||||
|
|
||||||
print(class_weights)
|
|
||||||
|
|
||||||
"""## 5. Build the model"""
|
|
||||||
|
|
||||||
import tensorflow as tf
|
|
||||||
from tensorflow.keras import layers, models
|
|
||||||
from tensorflow.keras.applications import DenseNet121
|
|
||||||
from tensorflow.keras.applications.densenet import preprocess_input
|
|
||||||
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
|
|
||||||
from tensorflow.keras.models import Model
|
|
||||||
from tensorflow.keras.optimizers import Adam
|
|
||||||
|
|
||||||
gpus = tf.config.list_physical_devices('GPU')
|
|
||||||
print("GPUs:", gpus)
|
|
||||||
|
|
||||||
strategy = tf.distribute.MirroredStrategy()
|
|
||||||
|
|
||||||
data_augmentation = tf.keras.Sequential([
|
|
||||||
tf.keras.layers.RandomFlip("horizontal"),
|
|
||||||
tf.keras.layers.RandomRotation(0.1),
|
|
||||||
tf.keras.layers.RandomZoom(0.1),
|
|
||||||
tf.keras.layers.RandomContrast(0.1),
|
|
||||||
])
|
|
||||||
|
|
||||||
base_model = DenseNet121(
|
|
||||||
weights='imagenet',
|
|
||||||
include_top=False,
|
|
||||||
input_shape=(224, 224, 3)
|
|
||||||
)
|
|
||||||
|
|
||||||
inputs = tf.keras.Input(shape=(224,224,3))
|
|
||||||
|
|
||||||
x = data_augmentation(inputs)
|
|
||||||
|
|
||||||
x = base_model.output
|
|
||||||
x = GlobalAveragePooling2D()(x)
|
|
||||||
x = Dense(512, activation='relu')(x) # Added another Dense layer
|
|
||||||
x = Dense(256, activation='relu')(x) # Existing Dense layer
|
|
||||||
predictions = Dense(1, activation='sigmoid')(x) # Output layer for binary classification
|
|
||||||
|
|
||||||
with strategy.scope(): # Use all gpus
|
|
||||||
model = Model(inputs=base_model.input, outputs=predictions)
|
|
||||||
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
|
|
||||||
|
|
||||||
"""## 6. Data Generators
|
|
||||||
|
|
||||||
I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
|
||||||
|
|
||||||
# Image dimensions
|
|
||||||
IMG_WIDTH = 224
|
|
||||||
IMG_HEIGHT = 224
|
|
||||||
|
|
||||||
# Data generators
|
|
||||||
train_datagen = ImageDataGenerator(
|
|
||||||
preprocessing_function=preprocess_input,
|
|
||||||
rotation_range=20,
|
|
||||||
width_shift_range=0.2,
|
|
||||||
height_shift_range=0.2,
|
|
||||||
shear_range=0.2,
|
|
||||||
zoom_range=0.2,
|
|
||||||
horizontal_flip=True,
|
|
||||||
fill_mode='nearest'
|
|
||||||
)
|
|
||||||
|
|
||||||
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
|
|
||||||
|
|
||||||
# Flow from dataframe
|
|
||||||
train_generator = train_datagen.flow_from_dataframe(
|
|
||||||
dataframe=train_df,
|
|
||||||
x_col='path',
|
|
||||||
y_col='target',
|
|
||||||
target_size=(IMG_WIDTH, IMG_HEIGHT),
|
|
||||||
batch_size=32,
|
|
||||||
class_mode='binary',
|
|
||||||
seed=42
|
|
||||||
)
|
|
||||||
|
|
||||||
val_generator = val_datagen.flow_from_dataframe(
|
|
||||||
dataframe=val_df,
|
|
||||||
x_col='path',
|
|
||||||
y_col='target',
|
|
||||||
target_size=(IMG_WIDTH, IMG_HEIGHT),
|
|
||||||
batch_size=32,
|
|
||||||
class_mode='binary',
|
|
||||||
seed=42
|
|
||||||
)
|
|
||||||
|
|
||||||
"""## 7. Train the Model"""
|
|
||||||
|
|
||||||
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
|
|
||||||
|
|
||||||
# Callbacks
|
|
||||||
early_stopping = EarlyStopping(
|
|
||||||
monitor='val_loss',
|
|
||||||
patience=10,
|
|
||||||
restore_best_weights=True
|
|
||||||
)
|
|
||||||
|
|
||||||
model_checkpoint = ModelCheckpoint(
|
|
||||||
'best_model.keras',
|
|
||||||
monitor='val_accuracy',
|
|
||||||
save_best_only=True,
|
|
||||||
mode='max'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Train the model
|
|
||||||
history = model.fit(
|
|
||||||
train_generator,
|
|
||||||
epochs=3, # You can adjust the number of epochs
|
|
||||||
validation_data=val_generator,
|
|
||||||
callbacks=[early_stopping, model_checkpoint],
|
|
||||||
class_weight=class_weights # Use class weights to handle imbalance
|
|
||||||
)
|
|
||||||
|
|
||||||
"""## 8. Evaluation
|
|
||||||
|
|
||||||
### Load best model
|
|
||||||
"""
|
|
||||||
|
|
||||||
from tensorflow.keras.models import load_model
|
|
||||||
|
|
||||||
best_model = load_model('best_model.keras')
|
|
||||||
|
|
||||||
"""### Evaluate on validation set"""
|
|
||||||
|
|
||||||
loss, accuracy = best_model.evaluate(val_generator)
|
|
||||||
print(f"Validation Loss: {loss:.4f}")
|
|
||||||
print(f"Validation Accuracy: {accuracy:.4f}")
|
|
||||||
|
|
||||||
"""### Predictions and Classification Report"""
|
|
||||||
|
|
||||||
from sklearn.metrics import classification_report, confusion_matrix
|
|
||||||
|
|
||||||
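val_generator.reset() # Restart from the first batch. NOTE: predictions line up with val_generator.classes only if the generator was created with shuffle=False (flow_from_dataframe shuffles by default)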
|
|
||||||
y_pred_probs = best_model.predict(val_generator)
|
|
||||||
y_pred = (y_pred_probs > 0.5).astype(int)
|
|
||||||
|
|
||||||
y_true = val_generator.classes
|
|
||||||
|
|
||||||
print("Classification Report:")
|
|
||||||
print(classification_report(y_true, y_pred))
|
|
||||||
|
|
||||||
"""### Confusion Matrix"""
|
|
||||||
|
|
||||||
cm = confusion_matrix(y_true, y_pred)
|
|
||||||
plt.figure(figsize=(8, 6))
|
|
||||||
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
|
|
||||||
plt.title('Confusion Matrix')
|
|
||||||
plt.xlabel('Predicted Label')
|
|
||||||
plt.ylabel('True Label')
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
"""### Training History Plots"""
|
|
||||||
|
|
||||||
plt.figure(figsize=(12, 5))
|
|
||||||
|
|
||||||
# Plot training & validation accuracy values
|
|
||||||
plt.subplot(1, 2, 1)
|
|
||||||
plt.plot(history.history['accuracy'])
|
|
||||||
plt.plot(history.history['val_accuracy'])
|
|
||||||
plt.title('Model Accuracy')
|
|
||||||
plt.ylabel('Accuracy')
|
|
||||||
plt.xlabel('Epoch')
|
|
||||||
plt.legend(['Train', 'Validation'], loc='upper left')
|
|
||||||
|
|
||||||
# Plot training & validation loss values
|
|
||||||
plt.subplot(1, 2, 2)
|
|
||||||
plt.plot(history.history['loss'])
|
|
||||||
plt.plot(history.history['val_loss'])
|
|
||||||
plt.title('Model Loss')
|
|
||||||
plt.ylabel('Loss')
|
|
||||||
plt.xlabel('Epoch')
|
|
||||||
plt.legend(['Train', 'Validation'], loc='upper left')
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.show()
|
|
||||||
Reference in New Issue
Block a user