# -*- coding: utf-8 -*-
"""Skin Cancer Classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR

# Skin Cancer Classification

**Output**: Benign / Malignant

**Model**: DenseNet121 (transfer learning)

**Dataset**: Skin Cancer: HAM10000 dataset
https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data

| Class | Name                                | Value     |
|-------|-------------------------------------|-----------|
| nv    | Melanocytic nevi (moles)            | Benign    |
| bcc   | Basal cell carcinoma                | Malignant |
| bkl   | Benign keratosis-like lesions       | Benign    |
| df    | Dermatofibroma                      | Benign    |
| vasc  | Vascular lesions                    | Benign    |
| mel   | Melanoma                            | Malignant |
| akiec | Actinic keratoses / Bowen's disease | Benign    |

## 1. Requirements and dataset download
"""

import sys

# True when the script runs inside a Google Colab runtime.
IN_COLAB = "google.colab" in sys.modules

# Notebook-only shell commands, kept for reference:
# if IN_COLAB:
#   !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
#   !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]

import kagglehub

# Kaggle handle ("owner/dataset") of the HAM10000 dataset.
_HAM10000_HANDLE = "kmader/skin-cancer-mnist-ham10000"

# Fetch the latest dataset version; `path` is the local folder with its files.
path = kagglehub.dataset_download(_HAM10000_HANDLE)

print("Path to dataset files:", path)

# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000

"""## 2. Imports and setup"""

import os
import glob

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image

# Source - https://stackoverflow.com/a/53586419
# Posted by korakot, modified by community.
See post 'Timeline' for change history # Retrieved 2026-05-21, License - CC BY-SA 4.0 """### Loading dataset""" # Path to your dataset folder dataset_path = path # Metadata file metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv") # Load CSV df = pd.read_csv(metadata_path) # Show first rows print(df.head()) # Collect all image paths image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True) # Create dictionary: # key = image_id # value = full image path imageid_path_dict = { os.path.splitext(os.path.basename(x))[0]: x for x in image_paths } # Add image path column df['path'] = df['image_id'].map(imageid_path_dict.get) # Check print(df[['image_id', 'path']].head()) """## 3. Dataset analysis ### Missingness """ missing = df['path'].isnull().sum() print(f"Missing images: {missing}") """### Class distribution""" plt.figure(figsize=(10,5)) sns.countplot(data=df, x='dx', order=df['dx'].value_counts().index) plt.title("Class Distribution") plt.xlabel("Diagnosis") plt.ylabel("Count") plt.show() """### Visualize samples""" fig, axes = plt.subplots(2, 4, figsize=(10,5)) for i, ax in enumerate(axes.flat): sample = df.sample(1).iloc[0] img = Image.open(sample['path']) ax.imshow(img) ax.set_title(sample['dx']) ax.axis('off') plt.tight_layout() plt.show() """### Data distribution""" plt.figure(figsize=(8,5)) sns.histplot(df['age'].dropna(), bins=20) plt.title("Age Distribution") plt.show() sns.countplot(data=df, x='sex') plt.title("Sex Distribution") plt.show() plt.figure(figsize=(12,5)) sns.countplot( data=df, x='localization', order=df['localization'].value_counts().index ) plt.xticks(rotation=45) plt.title("Lesion Localization") plt.show() """### Image sizes""" sizes = [] for path in df['path'].sample(100): img = Image.open(path) sizes.append(img.size) print(pd.Series(sizes).value_counts()) """## 2. 
Prepare dataset ### Encoding Binary Labels """ # Mapping from dx to benign/malignant # 0 --> benign # 1 --> malignant benign_malignant_dict = { 'nv': 0, 'bcc': 1, 'bkl': 0, 'df': 0, 'vasc': 0, 'mel': 1, 'akiec': 0 } # Create new column df['target'] = df['dx'].map(benign_malignant_dict) # Preview print(df[['dx', 'target']].head()) from sklearn.model_selection import train_test_split train_df, val_df = train_test_split( df, test_size=0.2, stratify=df['target'], random_state=42 ) print("Train size:", len(train_df)) print("Validation size:", len(val_df)) # Convert 'target' column to string type for ImageDataGenerator train_df['target'] = train_df['target'].astype(str) val_df['target'] = val_df['target'].astype(str) print("Train target dtype after conversion:", train_df['target'].dtype) print("Validation target dtype after conversion:", val_df['target'].dtype) plt.figure(figsize=(6,3)) sns.countplot(data=df, x='target') plt.title("Benign vs Malignant Distribution") plt.xlabel("Lesion Type") plt.ylabel("Count") plt.show() print(df['target'].value_counts()) """### Train / Validation split ### Class weight (class imbalance) """ from sklearn.utils.class_weight import compute_class_weight classes = np.unique(train_df['target']) weights = compute_class_weight( class_weight='balanced', classes=classes, y=train_df['target'] ) class_weights = dict(enumerate(weights)) print(class_weights) """## 3. 
Build the model""" import tensorflow as tf from tensorflow.keras import layers, models from tensorflow.keras.applications import DenseNet121 from tensorflow.keras.applications.densenet import preprocess_input from tensorflow.keras.layers import Dense, GlobalAveragePooling2D from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam gpus = tf.config.list_physical_devices('GPU') print("GPUs:", gpus) strategy = tf.distribute.MirroredStrategy() data_augmentation = tf.keras.Sequential([ tf.keras.layers.RandomFlip("horizontal"), tf.keras.layers.RandomRotation(0.1), tf.keras.layers.RandomZoom(0.1), tf.keras.layers.RandomContrast(0.1), ]) base_model = DenseNet121( weights='imagenet', include_top=False, input_shape=(224, 224, 3) ) inputs = tf.keras.Input(shape=(224,224,3)) x = data_augmentation(inputs) x = base_model.output x = GlobalAveragePooling2D()(x) x = Dense(512, activation='relu')(x) # Added another Dense layer x = Dense(256, activation='relu')(x) # Existing Dense layer predictions = Dense(1, activation='sigmoid')(x) # Output layer for binary classification # with strategy.scope(): # Use all gpus model = Model(inputs=base_model.input, outputs=predictions) model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy']) """## 4. Data Generators I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images. 
""" from tensorflow.keras.preprocessing.image import ImageDataGenerator # Image dimensions IMG_WIDTH = 224 IMG_HEIGHT = 224 # Data generators train_datagen = ImageDataGenerator( preprocessing_function=preprocess_input, rotation_range=20, width_shift_range=0.2, height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode='nearest' ) val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input) # Flow from dataframe train_generator = train_datagen.flow_from_dataframe( dataframe=train_df, x_col='path', y_col='target', target_size=(IMG_WIDTH, IMG_HEIGHT), batch_size=32, class_mode='binary', seed=42 ) val_generator = val_datagen.flow_from_dataframe( dataframe=val_df, x_col='path', y_col='target', target_size=(IMG_WIDTH, IMG_HEIGHT), batch_size=32, class_mode='binary', seed=42 ) """## 6. Train the Model""" from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint # Callbacks early_stopping = EarlyStopping( monitor='val_loss', patience=10, restore_best_weights=True ) model_checkpoint = ModelCheckpoint( 'best_model.keras', monitor='val_accuracy', save_best_only=True, mode='max' ) # Train the model print("Training model...") history = model.fit( train_generator, epochs=50, # You can adjust the number of epochs validation_data=val_generator, callbacks=[early_stopping, model_checkpoint], class_weight=class_weights # Use class weights to handle imbalance ) """## 7. 
Evaluation

### Load best model
"""

from tensorflow.keras.models import load_model

# Reload the checkpoint written by ModelCheckpoint during training.
best_model = load_model('best_model.keras')

"""### Evaluate on validation set"""

loss, accuracy = best_model.evaluate(val_generator)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

"""### Predictions and Classification Report"""

from sklearn.metrics import classification_report, confusion_matrix

# Display names for class indices 0 and 1.
CLASS_NAMES = ['Benign', 'Malignant']

# Labels and predictions are collected from the same batches, so they stay
# aligned even if the generator shuffles its samples. `val_generator.classes`
# is in dataframe order and only matches `predict(val_generator)` when
# shuffling is off.
val_generator.reset()
label_batches = []
prob_batches = []
for batch_index in range(len(val_generator)):
    batch_images, batch_labels = val_generator[batch_index]
    prob_batches.append(best_model.predict_on_batch(batch_images))
    label_batches.append(batch_labels)

y_pred_probs = np.concatenate(prob_batches)  # one sigmoid output per image
y_true = np.concatenate(label_batches).astype(int)
# Threshold at 0.5 and flatten to 1-D so it has the same shape as y_true.
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

print("Classification Report:")
# `labels` pins both classes so the report keeps two rows even if one class
# is never predicted.
print(classification_report(
    y_true,
    y_pred,
    labels=[0, 1],
    target_names=CLASS_NAMES,
    zero_division=0
))

"""### Confusion Matrix"""

# `labels` keeps the matrix 2x2 so it always matches the tick labels.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=CLASS_NAMES,
    yticklabels=CLASS_NAMES
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

"""### Training History Plots"""

plt.figure(figsize=(12, 5))

# Plot training & validation accuracy values
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()