# -*- coding: utf-8 -*-
"""Skin Cancer Classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR

# Skin Cancer Classification
**Output**: Benign / Malignant

**Model**: DenseNet121 (transfer learning)

**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data

| Class | Name                                | Value     |
|-------|-------------------------------------|-----------|
| nv    | Melanocytic nevi (moles)            | Benign    |
| bcc   | Basal cell carcinoma                | Malignant |
| bkl   | Benign keratosis-like lesions       | Benign    |
| df    | Dermatofibroma                      | Benign    |
| vasc  | Vascular lesions                    | Benign    |
| mel   | Melanoma                            | Malignant |
| akiec | Actinic keratoses / Bowen's disease | Benign    |

## 0. Requirements
"""

# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow

"""## 1. Download dataset"""

# Notebook-only setup step (run manually when kagglehub is not installed):
# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]

import kagglehub

# Kaggle handle (owner/dataset) of the HAM10000 skin-lesion dataset.
KAGGLE_DATASET = "kmader/skin-cancer-mnist-ham10000"

# Download the latest version of the dataset; `path` holds whatever location
# kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(KAGGLE_DATASET)
print("Path to dataset files:", path)

# Optional notebook-only step: copy the dataset into the Colab working area.
# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000

"""## 2. Imports and setup"""

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image

"""### Loading dataset"""

# Dataset root folder. The Kaggle/Colab mount point is used when it exists;
# otherwise fall back to the directory returned by
# kagglehub.dataset_download() (`path`), so the script also runs on machines
# where the dataset was downloaded somewhere else.
dataset_path = "/kaggle/input/skin-cancer-mnist-ham10000"
if not os.path.isdir(dataset_path):
    dataset_path = path

# Metadata file: one row per image (columns used below: image_id, dx, age,
# sex, localization).
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
df = pd.read_csv(metadata_path)

# Show first rows
print(df.head())

# Collect all image paths anywhere below the dataset root. Sorting makes the
# result independent of filesystem enumeration order, so when the same
# image_id appears in more than one folder the chosen path is reproducible.
image_paths = sorted(
    glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)
)

# Lookup table: image_id (file name without extension) -> full image path.
imageid_path_dict = {
    os.path.splitext(os.path.basename(image_file))[0]: image_file
    for image_file in image_paths
}

# Add the image path column; ids without a matching file map to None and are
# counted in the "Missingness" check that follows.
df['path'] = df['image_id'].map(imageid_path_dict.get)

# Check
print(df[['image_id', 'path']].head())

"""## 3. Dataset analysis

### Missingness
"""

# Number of metadata rows for which no image file was found on disk.
missing = df['path'].isnull().sum()
print(f"Missing images: {missing}")

# Rows that can actually be opened as images; identical to `df` when nothing
# is missing. Used wherever files are read below so a missing path cannot
# crash Image.open().
df_with_images = df[df['path'].notnull()]

"""### Class distribution"""

plt.figure(figsize=(10,5))
sns.countplot(data=df, x='dx', order=df['dx'].value_counts().index)
plt.title("Class Distribution")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.show()

"""### Visualize samples"""

fig, axes = plt.subplots(2, 4, figsize=(10,5))

# One randomly drawn image per subplot, titled with its diagnosis code.
for ax in axes.flat:
    sample = df_with_images.sample(1).iloc[0]

    # Context manager closes the file handle once the pixels are drawn.
    with Image.open(sample['path']) as img:
        ax.imshow(img)

    ax.set_title(sample['dx'])
    ax.axis('off')

plt.tight_layout()
plt.show()

"""### Data distribution"""

plt.figure(figsize=(8,5))
sns.histplot(df['age'].dropna(), bins=20)
plt.title("Age Distribution")
plt.show()

sns.countplot(data=df, x='sex')
plt.title("Sex Distribution")
plt.show()

plt.figure(figsize=(12,5))
sns.countplot(
    data=df,
    x='localization',
    order=df['localization'].value_counts().index
)
plt.xticks(rotation=45)
plt.title("Lesion Localization")
plt.show()

"""### Image sizes"""

# Inspect the (width, height) of up to 100 randomly chosen images.
sizes = []
sample_count = min(100, len(df_with_images))

# The loop variable is deliberately not called `path`: that name holds the
# dataset download location set earlier in the script.
for image_path in df_with_images['path'].sample(sample_count):
    with Image.open(image_path) as img:
        sizes.append(img.size)

print(pd.Series(sizes).value_counts())

"""## 2. Prepare dataset

### Encoding Binary Labels
"""

# Mapping from dx to benign/malignant
# 0 --> benign
# 1 --> malignant
# NOTE(review): 'akiec' (actinic keratoses / Bowen's disease) is labelled
# benign here; some binary groupings of HAM10000 treat it as
# pre-malignant/malignant. Confirm this label with a domain expert.
benign_malignant_dict = {
    'nv': 0,
    'bcc': 1,
    'bkl': 0,
    'df': 0,
    'vasc': 0,
    'mel': 1,
    'akiec': 0,
}

# Create the integer binary label column.
df['target'] = df['dx'].map(benign_malignant_dict)

# Preview
print(df[['dx', 'target']].head())

plt.figure(figsize=(6,3))
sns.countplot(data=df, x='target')
plt.title("Benign vs Malignant Distribution")
plt.xlabel("Lesion Type")
plt.ylabel("Count")
plt.show()

print(df['target'].value_counts())

"""### Train / Validation split"""

from sklearn.model_selection import train_test_split

# Stratified 80/20 split so both subsets keep the benign/malignant ratio.
# NOTE(review): this splits row-wise. If the metadata contains several images
# of the same lesion (presumably identifiable via a lesion id column — TODO
# confirm), a group-aware split would avoid leakage between the two subsets.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))

"""### Class weight (class imbalance)"""

from sklearn.utils.class_weight import compute_class_weight

# Computed on the integer labels, i.e. before the string conversion below, so
# the dictionary keys are the class indices 0 / 1 that Keras expects.
classes = np.unique(train_df['target'])

weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_df['target']
)

# Key each weight by its own label instead of by enumeration position.
class_weights = {
    int(label): float(weight) for label, weight in zip(classes, weights)
}

print(class_weights)

# Convert 'target' column to string type for ImageDataGenerator.
# This has to happen after the split: train_df / val_df do not exist before
# it. assign() returns new frames, so the split results are not mutated in
# place.
train_df = train_df.assign(target=train_df['target'].astype(str))
val_df = val_df.assign(target=val_df['target'].astype(str))

print("Train target dtype after conversion:", train_df['target'].dtype)
print("Validation target dtype after conversion:", val_df['target'].dtype)

"""## 3. Build the model"""

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

# Optional in-model augmentation pipeline.
# It is intentionally NOT attached to the model graph below: the training
# images are already augmented by the ImageDataGenerator used for training,
# and attaching this as well would augment every image twice. Calling it on a
# separate Input and then starting again from base_model.output (as this cell
# used to do) has no effect on the model, so that dead wiring was removed.
# To use it instead of generator-side augmentation, build the graph as
# `base_model(data_augmentation(tf.keras.Input(shape=(224, 224, 3))))`.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
])

# ImageNet-pretrained DenseNet121 backbone without its classification head.
base_model = DenseNet121(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)

# The model consumes 224x224 RGB images through the backbone's own input.
inputs = base_model.input

# Classification head: pooled backbone features -> two dense layers -> one
# sigmoid unit.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dense(256, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

model = Model(inputs=inputs, outputs=predictions)

"""## 4. Data Generators

I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
"""

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Image dimensions
IMG_WIDTH = 224
IMG_HEIGHT = 224

# Number of images per batch for both generators.
BATCH_SIZE = 32

# Training generator: DenseNet preprocessing plus random geometric
# augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation generator: preprocessing only, no augmentation.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Flow from dataframe. Keras expects target_size as (height, width).
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    seed=42
)

# shuffle=False keeps the batches in dataframe order. Evaluation code that
# compares model.predict(val_generator) with val_generator.classes relies on
# this; with the default shuffle=True the predictions and labels would not
# line up.
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False,
    seed=42
)

"""## 5. Compile the Model

I will compile the model using the Adam optimizer, binary cross-entropy loss (suitable for binary classification), and track accuracy as a metric.
"""

from tensorflow.keras.optimizers import Adam

# Adam with learning rate 1e-4, binary cross-entropy loss for the single
# sigmoid output, accuracy reported as the metric.
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

"""## 6. Train the Model

I will now train the model using the prepared data generators. I'll also add callbacks for early stopping to prevent overfitting and to save the best model.
"""

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

# Keep only the checkpoint with the highest validation accuracy on disk.
model_checkpoint = ModelCheckpoint(
    'best_model.keras', monitor='val_accuracy', save_best_only=True, mode='max'
)

# Training configuration, gathered in one place so it is easy to adjust.
fit_options = dict(
    epochs=3,  # You can adjust the number of epochs
    validation_data=val_generator,
    callbacks=[early_stopping, model_checkpoint],
    class_weight=class_weights,  # Use class weights to handle imbalance
)

# Train the model
history = model.fit(train_generator, **fit_options)

"""## 7. Evaluation

### Load best model
"""

from tensorflow.keras.models import load_model

# Reload the checkpoint written during training.
best_model = load_model('best_model.keras')

"""### Evaluate on validation set"""

loss, accuracy = best_model.evaluate(val_generator)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

"""### Predictions and Classification Report"""

from sklearn.metrics import classification_report, confusion_matrix

# Predict batch by batch and take the labels from the very same batches.
# reset() only rewinds the generator; it does not undo shuffling, so reading
# val_generator.classes next to predict(val_generator) can pair predictions
# with the wrong labels. Collecting (labels, probabilities) per batch keeps
# them aligned whatever order the generator yields.
val_generator.reset()

batch_labels = []
batch_probs = []
for batch_index in range(len(val_generator)):
    batch_images, batch_targets = val_generator[batch_index]
    batch_labels.append(batch_targets)
    batch_probs.append(best_model.predict_on_batch(batch_images))

# Flatten to 1-D so predictions and labels have the same shape.
y_true = np.concatenate(batch_labels).astype(int).ravel()
y_pred_probs = np.concatenate(batch_probs).ravel()
y_pred = (y_pred_probs > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_true, y_pred))

"""### Confusion Matrix"""

# labels=[0, 1] pins the matrix to 2x2 so it always matches the tick labels,
# even if one class never occurs in y_true / y_pred.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

"""### Training History Plots"""

plt.figure(figsize=(12, 5))

# Plot training & validation accuracy values
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()