551 lines
14 KiB
Python
551 lines
14 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""Skin Cancer Classification.ipynb
|
|
|
|
Automatically generated by Colab.
|
|
|
|
Original file is located at
|
|
https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
|
|
|
|
# Skin Cancer Classification
|
|
**Output**: Benign / Malignant
|
|
|
|
**Model**: DenseNet121 (transfer learning)
|
|
|
|
**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
|
|
|
|
| Class | Name | Value |
|
|
|-------|-------------------------------------|-----------|
|
|
| nv | Melanocytic nevi (moles) | Benign |
|
|
| bcc | Basal cell carcinoma | Malignant |
|
|
| bkl | Benign keratosis-like lesions | Benign |
|
|
| df | Dermatofibroma | Benign |
|
|
| vasc | Vascular lesions | Benign |
|
|
| mel | Melanoma | Malignant |
|
|
| akiec | Actinic keratoses / Bowen's disease | Benign |
|
|
|
|
## 1. Requirements and dataset download
|
|
"""
|
|
|
|
import sys
|
|
IN_COLAB = 'google.colab' in sys.modules
|
|
|
|
# if IN_COLAB:
|
|
# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
|
|
# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
|
|
|
|
import kagglehub
|
|
|
|
# Fetch the latest version of the HAM10000 dataset through kagglehub.
# `path` holds whatever location dataset_download() returns; the rest of the
# script uses it as the dataset root folder.
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")

print("Path to dataset files:", path)
|
|
|
|
# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
|
|
|
|
"""## 2. Imports and setup"""
|
|
|
|
import os
|
|
import glob
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
|
|
from PIL import Image
|
|
|
|
# Source - https://stackoverflow.com/a/53586419
|
|
# Posted by korakot, modified by community. See post 'Timeline' for change history
|
|
# Retrieved 2026-05-21, License - CC BY-SA 4.0
|
|
|
|
"""### Loading dataset
|
|
|
|
We start loading the CSV dataset into a dataframe:
|
|
"""
|
|
|
|
# Root folder of the downloaded dataset
dataset_path = path

# Location of the metadata CSV inside the dataset folder
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")

# Read the metadata into a DataFrame and preview its first rows
df = pd.read_csv(metadata_path)
print(df.head())
|
|
|
|
"""Here we load all JPG images (path format) from the dataset to the dataframe:"""
|
|
|
|
# Recursively collect every JPG file below the dataset folder
image_paths = glob.glob(
    os.path.join(dataset_path, "**", "*.jpg"),
    recursive=True,
)

# Lookup table:
#   key   = image_id (file name without its extension)
#   value = full image path
# If the same file name appears more than once, the path seen last is kept.
imageid_path_dict = dict(
    (os.path.splitext(os.path.basename(jpg_file))[0], jpg_file)
    for jpg_file in image_paths
)

# Attach the resolved path to each metadata row (None when no file matches)
df['path'] = df['image_id'].map(imageid_path_dict.get)

# Check
print(df[['image_id', 'path']].head())
|
|
|
|
"""## 3. Dataset analysis
|
|
|
|
### Missingness
|
|
|
|
The dataset does not have any missing image (every `image_id` resolves to a file path):
|
|
"""
|
|
|
|
# Number of metadata rows for which no image file was found
missing = df['path'].isna().sum()

print(f"Missing images: {missing}")
|
|
|
|
"""### Class distribution
|
|
|
|
If we analyze the distribution of lesion types, we find a clear class imbalance: _Melanocytic nevi_ is the most abundant class.
|
|
"""
|
|
|
|
# Bar chart of the number of samples per diagnosis code,
# with the bars ordered from most to least frequent.
plt.figure(figsize=(10, 5))
sns.countplot(
    x='dx',
    data=df,
    order=df['dx'].value_counts().index,
)
plt.title("Lesion type distribution")
plt.xlabel("Lesion type")
plt.ylabel("Count")
plt.show()
|
|
|
|
"""### Visualize samples
|
|
|
|
We visualize some random samples of the dataset, showing the lesion type of each sample as its title:
|
|
"""
|
|
|
|
# Show a 2x4 grid of randomly drawn lesions, each titled with its diagnosis.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))

# Draw all rows in a single call (sampling without replacement) so that the
# grid never shows the same image twice.
samples = df.sample(axes.size)

for position, ax in enumerate(axes.flat):
    sample = samples.iloc[position]

    # The context manager releases the file handle once the pixels have been
    # handed to matplotlib.
    with Image.open(sample['path']) as img:
        ax.imshow(img)

    ax.set_title(sample['dx'])
    ax.axis('off')

plt.tight_layout()
plt.show()
|
|
|
|
"""### Data distribution
|
|
|
|
It is interesting to study the distribution of other features in the dataset, in order to detect outliers.
|
|
|
|
We start with the age, plotted in 20 bins. First, we find that the most abundant age is around 50 years old (from 40 to 58). One surprise is that two bins contain zero samples: they correspond to the ages from 55 to 60 and from 25 to 30 years old.
|
|
"""
|
|
|
|
# Histogram of patient age in 20 bins; rows without an age are dropped first
plt.figure(figsize=(8, 5))
sns.histplot(data=df['age'].dropna(), bins=20)
plt.title("Age Distribution")
plt.show()
|
|
|
|
"""However, sex distribution is quite balanced. Even so, there are more males than females."""
|
|
|
|
# Number of samples per value of the 'sex' column
sns.countplot(x='sex', data=df)
plt.title("Sex Distribution")
plt.show()
|
|
|
|
"""Lesion localization is not equal probable in all parts of the body. Instead, back and lower extremity are the most common for skin lesions."""
|
|
|
|
# Number of samples per body site, most frequent site first
plt.figure(figsize=(12, 5))
sns.countplot(
    x='localization',
    data=df,
    order=df['localization'].value_counts().index,
)

# Tilt the tick labels so the site names stay readable
plt.xticks(rotation=45)
plt.title("Lesion Localization")
plt.show()
|
|
|
|
"""### Image sizes
|
|
|
|
The size of the images is 600 x 450 pixels.
|
|
"""
|
|
|
|
# Tally the image dimensions of a random sample of (at most) 100 images.
# PIL reports `size` as a (width, height) tuple.
sizes = []

# The loop variable is deliberately NOT called `path`: that name holds the
# dataset folder returned by kagglehub and must not be overwritten here.
for image_path in df['path'].sample(min(100, len(df))):
    # The context manager closes each file as soon as its size has been read.
    with Image.open(image_path) as img:
        sizes.append(img.size)

print(pd.Series(sizes).value_counts())
|
|
|
|
"""## 2. Prepare dataset
|
|
|
|
In this section, we process the previous dataframe to make it valid for the model training.
|
|
|
|
### Encoding Binary Labels
|
|
|
|
Our goal is to classify between benign and malignant. Then, we should encode the lesion type to a new feature describing both states. This new feature is numerical: 0 means benign and 1, malignant.
|
|
"""
|
|
|
|
# Binary label assigned to each diagnosis code found in the 'dx' column:
#   0 --> benign
#   1 --> malignant
# NOTE(review): 'akiec' (actinic keratoses / Bowen's disease) is labelled
# benign here; confirm that this is the intended clinical grouping.
benign_malignant_dict = {
    'nv': 0,     # Melanocytic nevi (moles)
    'bcc': 1,    # Basal cell carcinoma
    'bkl': 0,    # Benign keratosis-like lesions
    'df': 0,     # Dermatofibroma
    'vasc': 0,   # Vascular lesions
    'mel': 1,    # Melanoma
    'akiec': 0,  # Actinic keratoses / Bowen's disease
}

# New 'target' column holding the binary label of every row
df['target'] = df['dx'].map(benign_malignant_dict)

# Preview
print(df[['dx', 'target']].head())
|
|
|
|
"""### Dataset split
|
|
|
|
We split the dataset into training (70%), validation (15%) and test (15%) sets.
|
|
"""
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Step 1: hold out 15% of all rows as the test set. Stratifying on 'target'
# keeps the benign/malignant ratio the same in both parts.
train_val_df, test_df = train_test_split(
    df,
    stratify=df['target'],
    test_size=0.15,
    random_state=42,
)

# Step 2: split the remaining rows into training and validation sets.
# 0.1765 of the remainder ≈ 15% of the total dataset.
train_df, val_df = train_test_split(
    train_val_df,
    stratify=train_val_df['target'],
    test_size=0.1765,
    random_state=42,
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))
|
|
|
|
"""We convert to string format the encoded column for ImageDataGenerator."""
|
|
|
|
# flow_from_dataframe() with class_mode='binary' only accepts string labels,
# so the 'target' column is cast to str in every split that feeds a generator.
# test_df is included: the test generator also uses class_mode='binary' and
# rejects an integer 'target' column.
# .assign() builds new DataFrames rather than writing into the objects
# returned by train_test_split, which can trigger SettingWithCopyWarning.
train_df = train_df.assign(target=train_df['target'].astype(str))
val_df = val_df.assign(target=val_df['target'].astype(str))
test_df = test_df.assign(target=test_df['target'].astype(str))

print("Train target dtype after conversion:", train_df['target'].dtype)
print("Validation target dtype after conversion:", val_df['target'].dtype)
print("Test target dtype after conversion:", test_df['target'].dtype)
|
|
|
|
"""### Balancing classes by oversampling the minority class
|
|
|
|
We decide to handle class imbalance through three approaches:
|
|
* Minority class oversampling (currently disabled: the code below is commented out)
|
|
* Class weight
|
|
* Data augmentation
|
|
"""
|
|
|
|
## Identify majority and minority classes in the training set
|
|
#class_counts = train_df['target'].value_counts()
|
|
#majority_class = class_counts.idxmax()
|
|
#minority_class = class_counts.idxmin()
|
|
|
|
## Get the DataFrames for majority and minority classes
|
|
#df_majority = train_df[train_df['target'] == majority_class]
|
|
#df_minority = train_df[train_df['target'] == minority_class]
|
|
|
|
## Oversample the minority class
|
|
#df_minority_oversampled = df_minority.sample(
|
|
# class_counts[majority_class], replace=True, random_state=42
|
|
#)
|
|
|
|
## Combine majority class with oversampled minority class
|
|
#train_df_balanced = pd.concat([df_majority, df_minority_oversampled])
|
|
|
|
## Shuffle the balanced DataFrame
|
|
#train_df_balanced = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
|
|
|
|
#print("Original train_df class distribution:")
|
|
#print(train_df['target'].value_counts())
|
|
#print("\nBalanced train_df_balanced class distribution:")
|
|
#print(train_df_balanced['target'].value_counts())
|
|
|
|
# Bar chart of the binary target in the training split, followed by the
# exact counts per class.
plt.figure(figsize=(6, 3))
sns.countplot(x='target', data=train_df)
plt.title("Benign vs Malignant Distribution")
plt.xlabel("Lesion Type")
plt.ylabel("Count")
plt.show()

print(train_df['target'].value_counts())
|
|
|
|
#plt.figure(figsize=(6,3))
|
|
|
|
#sns.countplot(data=train_df_balanced, x='target')
|
|
|
|
#plt.title("Benign vs Malignant Distribution")
|
|
#plt.xlabel("Lesion Type")
|
|
#plt.ylabel("Count")
|
|
#plt.show()
|
|
|
|
#print(train_df_balanced['target'].value_counts())
|
|
|
|
"""### Class weight (class imbalance)"""
|
|
|
|
from sklearn.utils.class_weight import compute_class_weight
|
|
|
|
# Distinct labels present in the training split (np.unique returns them sorted)
classes = np.unique(train_df['target'])

# One weight per entry of `classes`, computed with scikit-learn's
# 'balanced' heuristic
weights = compute_class_weight(
    y=train_df['target'],
    classes=classes,
    class_weight='balanced',
)

# Key every weight by its position in `classes`: {0: w0, 1: w1, ...}
class_weights = {position: weight for position, weight in enumerate(weights)}

print(class_weights)
|
|
|
|
"""## 3. Build the model"""
|
|
|
|
import tensorflow as tf
|
|
from tensorflow.keras import layers, models
|
|
from tensorflow.keras.applications import DenseNet121
|
|
from tensorflow.keras.applications.densenet import preprocess_input
|
|
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
|
|
from tensorflow.keras.models import Model
|
|
from tensorflow.keras.optimizers import Adam
|
|
|
|
# Report the GPUs that TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", gpus)

# NOTE(review): the strategy object is created, but nothing below is built
# inside `strategy.scope()` — confirm whether multi-GPU training was intended.
strategy = tf.distribute.MirroredStrategy()

# Random augmentation layers placed at the front of the model.
# NOTE(review): the training ImageDataGenerator defined later in this script
# also augments its images — confirm that stacking both is intended.
data_augmentation = models.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
    layers.RandomContrast(0.1),
])

# DenseNet121 with ImageNet weights and without its classification head
base_model = DenseNet121(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# Functional graph:
# input -> augmentation -> DenseNet121 -> global average pooling
#       -> Dense(512) -> Dense(256) -> single sigmoid unit
inputs = layers.Input(shape=(224, 224, 3))

x = data_augmentation(inputs)
x = base_model(x)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(256, activation='relu')(x)

# Output layer for binary classification
predictions = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs=inputs, outputs=predictions)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
|
|
|
|
"""## 4. Data Generators
|
|
|
|
I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
|
|
"""
|
|
|
|
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
|
|
|
# Image dimensions
IMG_WIDTH = 224
IMG_HEIGHT = 224

# Training images: DenseNet preprocessing plus random geometric augmentation
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Validation images: DenseNet preprocessing only, no augmentation
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Flow from dataframe.
# Keras expects target_size as (height, width), so the height goes first.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    seed=42,
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    seed=42,
)

"""Note: `train_generator` reads from the original `train_df` DataFrame; the oversampling step that would build `train_df_balanced` is commented out. Class imbalance is addressed at training time through the class weights passed to `model.fit` and through data augmentation.

## 6. Train the Model
"""
|
|
|
|
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
|
|
|
|
# Write the model to 'best_model.keras' only when the validation accuracy
# improves on the best value seen so far.
model_checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
)

# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=30,  # You can adjust the number of epochs
    class_weight=class_weights,  # Use class weights to handle imbalance
    callbacks=[model_checkpoint],
)
|
|
|
|
"""## 7. Evaluation"""
|
|
|
|
# Image dimensions used for the test images
IMG_WIDTH = 224
IMG_HEIGHT = 224

# Test images: DenseNet preprocessing only, no augmentation
test_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='path',
    y_col='target',
    # Keras expects target_size as (height, width), so the height goes first.
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    # CRITICAL: without shuffling, batches follow the DataFrame row order, so
    # predictions can be lined up with the generator's `classes` labels.
    shuffle=False,
)
|
|
|
|
"""### Load best model"""
|
|
|
|
from tensorflow.keras.models import load_model
|
|
|
|
# Reload the checkpoint file written during training
best_model = load_model('best_model.keras')

"""### Evaluate on test set"""

# evaluate() needs batches of (images, labels). The raw test_df only holds
# file paths and metadata, so the generator built on top of it is passed.
results = best_model.evaluate(test_generator)

for name, value in zip(best_model.metrics_names, results):
    print(f"{name}: {value:.4f}")
|
|
|
|
"""### Predictions and Classification Report"""
|
|
|
|
from sklearn.metrics import classification_report, confusion_matrix
|
|
|
|
# Restart the generator from its first batch before predicting
test_generator.reset()

# Predicted probability of class 1 for each test image, thresholded at 0.5
y_prob = best_model.predict(test_generator)
y_pred = (y_prob > 0.5).astype(int).ravel()

# The ground truth must come from the same generator that produced the
# predictions (the test generator, not the validation one); otherwise the
# labels belong to different images.
y_true = test_generator.classes

print("Classification Report:")
print(classification_report(y_true, y_pred))
|
|
|
|
"""### Confusion Matrix"""
|
|
|
|
from sklearn.metrics import confusion_matrix
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Confusion matrix (rows = true labels, columns = predicted labels),
# drawn as an annotated heatmap.
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    cmap='Blues',
    annot=True,
    fmt='d',
    xticklabels=["Benign", "Malignant"],
    yticklabels=["Benign", "Malignant"],
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Test Confusion Matrix")
plt.show()
|
|
|
|
"""### Classification report"""
|
|
|
|
from sklearn.metrics import classification_report
|
|
|
|
# Per-class precision / recall / F1, printed with readable class names
print(classification_report(
    y_true=y_true,
    y_pred=y_pred,
    target_names=["Benign", "Malignant"],
))

"""### ROC-AUC"""

from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the predicted probabilities
auc = roc_auc_score(y_true=y_true, y_score=y_prob)

print(f"ROC-AUC: {auc:.4f}")

"""### Recall"""

from sklearn.metrics import recall_score

# Recall of the positive class (label 1), reported as sensitivity
sensitivity = recall_score(y_true=y_true, y_pred=y_pred)

print(f"Sensitivity: {sensitivity:.4f}")
|
|
|
|
"""### Training History Plots"""
|
|
|
|
plt.figure(figsize=(12, 5))

# One panel per metric, side by side:
# (training key, validation key, panel title, y-axis label)
panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
]

for position, (train_key, val_key, title, ylabel) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)

    # Training curve first, validation curve second (matches the legend order)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()
|