First test
This commit is contained in:
24
run.sh
24
run.sh
@@ -1,5 +1,23 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
echo "Hello!"
|
set -e
|
||||||
sleep 15
|
|
||||||
echo "Goodbye!"
|
VENV_DIR=".venv"
|
||||||
|
|
||||||
|
# 1. Create the virtual environment on first run only.
if [[ ! -d "$VENV_DIR" ]]; then
    echo "Creating virtual environment..."
    python3 -m venv "$VENV_DIR"
fi

# 2. Activate the venv so the `pip` / `python3` calls below resolve inside it.
source "$VENV_DIR/bin/activate"

# 3. Install dependencies (safe to re-run; pip skips what is already present).
echo "Installing dependencies..."
pip install --upgrade pip
pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
# The extras spec is quoted: unquoted square brackets are a shell glob
# pattern and could be expanded against files in the working directory.
pip install --upgrade "kagglehub[pandas-datasets,hf-datasets]"

# 4. Run the training / evaluation script.
python3 skin_cancer_classification.py
echo "Done."
|
||||||
|
|||||||
410
skin_cancer_classification.py
Normal file
410
skin_cancer_classification.py
Normal file
@@ -0,0 +1,410 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Skin Cancer Classification.ipynb
|
||||||
|
|
||||||
|
Automatically generated by Colab.
|
||||||
|
|
||||||
|
Original file is located at
|
||||||
|
https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
|
||||||
|
|
||||||
|
# Skin Cancer Classification
|
||||||
|
**Output**: Benign / Malignant
|
||||||
|
|
||||||
|
**Model**: DenseNet121 (transfer learning)
|
||||||
|
|
||||||
|
**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
|
||||||
|
|
||||||
|
| Class | Name | Value |
|
||||||
|
|-------|-------------------------------------|-----------|
|
||||||
|
| nv | Melanocytic nevi (moles) | Benign |
|
||||||
|
| bcc | Basal cell carcinoma | Malignant |
|
||||||
|
| bkl | Benign keratosis-like lesions | Benign |
|
||||||
|
| df | Dermatofibroma | Benign |
|
||||||
|
| vasc | Vascular lesions | Benign |
|
||||||
|
| mel | Melanoma | Malignant |
|
||||||
|
| akiec | Actinic keratoses / Bowen's disease | Benign |
|
||||||
|
|
||||||
|
## 0. Requirements
|
||||||
|
"""
|
||||||
|
|
||||||
|
# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
|
||||||
|
|
||||||
|
"""## 1. Download dataset"""
|
||||||
|
|
||||||
|
# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
|
||||||
|
|
||||||
|
import kagglehub
|
||||||
|
|
||||||
|
# Fetch the most recent published version of the HAM10000 dataset.
# `path` is whatever location kagglehub reports for the downloaded files.
dataset_handle = "kmader/skin-cancer-mnist-ham10000"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)
|
||||||
|
|
||||||
|
# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
|
||||||
|
|
||||||
|
"""## 2. Imports and setup"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
"""### Loading dataset"""
|
||||||
|
|
||||||
|
# Dataset root. Use the directory reported by kagglehub.dataset_download()
# instead of a hard-coded "/kaggle/input/..." mount point, which only exists
# inside Kaggle-style environments and breaks local runs.
dataset_path = path

# Metadata file (one row per image)
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")

# Load CSV
df = pd.read_csv(metadata_path)

# Show first rows
print(df.head())

# Collect all image paths, searching every sub-folder of the dataset root
image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)

# Create dictionary:
#   key   = image_id (file name without extension)
#   value = full image path
imageid_path_dict = {
    os.path.splitext(os.path.basename(x))[0]: x
    for x in image_paths
}

# Add image path column; ids with no matching file end up as missing values
df['path'] = df['image_id'].map(imageid_path_dict.get)

# Check
print(df[['image_id', 'path']].head())
|
||||||
|
|
||||||
|
"""## 3. Dataset analysis
|
||||||
|
|
||||||
|
### Missingness
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Count metadata rows for which no image file was located on disk.
missing = df['path'].isna().sum()

print(f"Missing images: {missing}")

"""### Class distribution"""

# Bars are ordered from the most to the least frequent diagnosis.
dx_order = df['dx'].value_counts().index

plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='dx', order=dx_order)

plt.title("Class Distribution")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.show()
|
||||||
|
|
||||||
|
"""### Visualize samples"""
|
||||||
|
|
||||||
|
fig, axes = plt.subplots(2, 4, figsize=(10,5))

# Only rows whose image file was actually found can be displayed;
# a missing path would make Image.open() fail.
displayable = df.dropna(subset=['path'])

for ax in axes.flat:
    # One random row per panel (panels are drawn independently).
    sample = displayable.sample(1).iloc[0]

    # The context manager closes the underlying file once the pixels
    # have been handed to matplotlib.
    with Image.open(sample['path']) as img:
        ax.imshow(img)

    ax.set_title(sample['dx'])
    ax.axis('off')

plt.tight_layout()
plt.show()
|
||||||
|
|
||||||
|
"""### Data distribution"""
|
||||||
|
|
||||||
|
# --- Age histogram (rows without an age are left out) ---
known_ages = df['age'].dropna()

plt.figure(figsize=(8, 5))
sns.histplot(known_ages, bins=20)
plt.title("Age Distribution")
plt.show()

# --- Sex counts ---
sns.countplot(data=df, x='sex')
plt.title("Sex Distribution")
plt.show()

# --- Lesion localization counts, most frequent site first ---
localization_order = df['localization'].value_counts().index

plt.figure(figsize=(12, 5))
sns.countplot(data=df, x='localization', order=localization_order)
plt.xticks(rotation=45)  # site names are long; tilt them so they do not overlap
plt.title("Lesion Localization")
plt.show()
|
||||||
|
|
||||||
|
"""### Image sizes"""
|
||||||
|
|
||||||
|
# Survey the pixel dimensions of a random subset of images.
# - rows without a resolved file are skipped (Image.open would fail on them)
# - the sample is capped at the number of available rows, since
#   Series.sample(100) raises when fewer than 100 rows exist
available_paths = df['path'].dropna()
sampled_paths = available_paths.sample(min(100, len(available_paths)))

sizes = []

# The loop variable is deliberately not called `path`, so the dataset
# directory stored in the module-level `path` is not overwritten.
for image_path in sampled_paths:
    # Context manager releases the file handle after reading the size.
    with Image.open(image_path) as img:
        sizes.append(img.size)

print(pd.Series(sizes).value_counts())
|
||||||
|
|
||||||
|
"""## 2. Prepare dataset
|
||||||
|
|
||||||
|
### Encoding Binary Labels
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Mapping from dx to benign/malignant
# 0 --> benign
# 1 --> malignant
benign_malignant_dict = {
    'nv': 0,
    'bcc': 1,
    'bkl': 0,
    'df': 0,
    'vasc': 0,
    'mel': 1,
    'akiec': 0,
}

# Create new column with the binary label
df['target'] = df['dx'].map(benign_malignant_dict)

# Preview
print(df[['dx', 'target']].head())

plt.figure(figsize=(6,3))

sns.countplot(data=df, x='target')

plt.title("Benign vs Malignant Distribution")
plt.xlabel("Lesion Type")
plt.ylabel("Count")
plt.show()

print(df['target'].value_counts())

"""### Train / Validation split"""

from sklearn.model_selection import train_test_split

# Stratify on the binary label so both splits keep the same class ratio.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42
)

# Work on independent copies so the column assignments below do not
# write into (or warn about) views of `df`.
train_df = train_df.copy()
val_df = val_df.copy()

print("Train size:", len(train_df))
print("Validation size:", len(val_df))

# Convert 'target' column to string type for ImageDataGenerator.
# This has to happen after the split: train_df / val_df do not exist
# before train_test_split() has run.
train_df['target'] = train_df['target'].astype(str)
val_df['target'] = val_df['target'].astype(str)

print("Train target dtype after conversion:", train_df['target'].dtype)
print("Validation target dtype after conversion:", val_df['target'].dtype)
|
||||||
|
|
||||||
|
"""### Class weight (class imbalance)"""
|
||||||
|
|
||||||
|
from sklearn.utils.class_weight import compute_class_weight

# Distinct labels present in the training split (np.unique returns them sorted).
classes = np.unique(train_df['target'])

# One weight per entry of `classes`, using scikit-learn's 'balanced' scheme.
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_df['target'],
)

# Key each weight by its position in `classes` (0, 1, ...).
class_weights = {position: weight for position, weight in enumerate(weights)}

print(class_weights)
|
||||||
|
|
||||||
|
"""## 3. Build the model"""
|
||||||
|
|
||||||
|
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

# In-model augmentation pipeline. It is defined but intentionally NOT part
# of the model graph: the model below is built from base_model.input, so
# these layers never see any data. The previous code called this pipeline
# on a separate Input tensor and then immediately overwrote the result
# with base_model.output, which was dead code and has been removed.
# NOTE(review): wiring these layers in front of DenseNet would stack on top
# of the augmentation done by the training generator and would operate on
# already-preprocessed pixels - confirm the intended design before enabling.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
])

# ImageNet-pretrained DenseNet121 without its classification head.
base_model = DenseNet121(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)

# Classification head stacked on the DenseNet feature maps.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dense(256, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

model = Model(inputs=base_model.input, outputs=predictions)
|
||||||
|
|
||||||
|
"""## 4. Data Generators
|
||||||
|
|
||||||
|
I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Image dimensions
IMG_WIDTH = 224
IMG_HEIGHT = 224

# Training generator: DenseNet preprocessing plus random augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation generator: preprocessing only, no augmentation.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Flow from dataframe.
# target_size is given as (height, width), the order Keras documents;
# both are 224 here so the resulting images are unchanged.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    seed=42
)

# shuffle=False keeps batches in dataframe order, so predictions made from
# this generator line up row-for-row with `val_generator.classes`. With the
# default (shuffle=True) any report built from `.classes` compares
# predictions against the wrong labels.
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    shuffle=False,
    seed=42
)
|
||||||
|
|
||||||
|
"""## 5. Compile the Model
|
||||||
|
|
||||||
|
I will compile the model using the Adam optimizer, binary cross-entropy loss (suitable for binary classification), and track accuracy as a metric.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from tensorflow.keras.optimizers import Adam

# Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric.
optimizer = Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

"""## 6. Train the Model

I will now train the model using the prepared data generators. I'll also add callbacks for early stopping to prevent overfitting and to save the best model.
"""

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Write the model to disk only when validation accuracy reaches a new maximum.
model_checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)

training_callbacks = [early_stopping, model_checkpoint]

# Train the model
history = model.fit(
    x=train_generator,
    epochs=3,  # You can adjust the number of epochs
    validation_data=val_generator,
    callbacks=training_callbacks,
    class_weight=class_weights,  # Use class weights to handle imbalance
)
|
||||||
|
|
||||||
|
"""## X. Evaluation
|
||||||
|
|
||||||
|
### Load best model
|
||||||
|
"""
|
||||||
|
|
||||||
|
from tensorflow.keras.models import load_model

# Reload the checkpoint written during training.
best_model = load_model('best_model.keras')

"""### Evaluate on validation set"""

# evaluate() yields the loss followed by the compiled metric (accuracy).
validation_metrics = best_model.evaluate(val_generator)
loss, accuracy = validation_metrics

print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
|
||||||
|
|
||||||
|
"""### Predictions and Classification Report"""
|
||||||
|
|
||||||
|
from sklearn.metrics import classification_report, confusion_matrix

# Rewind the generator so prediction starts from its first batch.
# NOTE(review): the comparison below pairs predictions with
# `val_generator.classes` by position - this is only meaningful if the
# generator yields samples in a fixed, unshuffled order; confirm.
val_generator.reset()

# Sigmoid outputs, thresholded at 0.5 into hard 0/1 labels.
y_pred_probs = best_model.predict(val_generator)
y_pred = (y_pred_probs > 0.5).astype(int)

# Ground-truth class indices as stored on the generator.
y_true = val_generator.classes

print("Classification Report:")
print(classification_report(y_true, y_pred))

"""### Confusion Matrix"""

cm = confusion_matrix(y_true, y_pred)

# Same tick labels on both axes: index 0 is benign, index 1 is malignant.
class_names = ['Benign', 'Malignant']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
|
||||||
|
|
||||||
|
"""### Training History Plots"""
|
||||||
|
|
||||||
|
plt.figure(figsize=(12, 5))

# One panel per tracked quantity:
# (subplot position, training key, validation key, title, y-axis label)
history_panels = (
    (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
)

for position, train_key, val_key, panel_title, y_label in history_panels:
    plt.subplot(1, 2, position)
    # Training curve first, validation curve second, matching the legend order.
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()
|
||||||
Reference in New Issue
Block a user