First test
This commit is contained in:
24
run.sh
24
run.sh
@@ -1,5 +1,23 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "Hello!"
|
||||
sleep 15
|
||||
echo "Goodbye!"
|
||||
# Abort on the first failing command so a broken install never reaches training.
set -e

VENV_DIR=".venv"

# 1. Create venv if needed
if [[ ! -d "$VENV_DIR" ]]; then
    echo "Creating virtual environment..."
    python3 -m venv "$VENV_DIR"
fi

# 2. Activate venv
source "$VENV_DIR/bin/activate"

# 3. Install dependencies (lightweight, safe to re-run)
echo "Installing dependencies..."
pip install --upgrade pip
pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
# The requirement is quoted because an unquoted [...] is a glob pattern: the
# shell could expand it against files in the working directory (or reject it
# under stricter globbing settings) before pip ever sees the extras.
pip install --upgrade 'kagglehub[pandas-datasets,hf-datasets]'

# 4. Run the training / evaluation script inside the venv
python3 skin_cancer_classification.py
echo "Done."
|
||||
|
||||
410
skin_cancer_classification.py
Normal file
410
skin_cancer_classification.py
Normal file
@@ -0,0 +1,410 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Skin Cancer Classification.ipynb
|
||||
|
||||
Automatically generated by Colab.
|
||||
|
||||
Original file is located at
|
||||
https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
|
||||
|
||||
# Skin Cancer Classification
|
||||
**Output**: Benign / Malignant
|
||||
|
||||
**Model**: DenseNet121 (transfer learning)
|
||||
|
||||
**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
|
||||
|
||||
| Class | Name | Value |
|
||||
|-------|-------------------------------------|-----------|
|
||||
| nv | Melanocytic nevi (moles) | Benign |
|
||||
| bcc | Basal cell carcinoma | Malignant |
|
||||
| bkl | Benign keratosis-like lesions | Benign |
|
||||
| df | Dermatofibroma | Benign |
|
||||
| vasc | Vascular lesions | Benign |
|
||||
| mel | Melanoma | Malignant |
|
||||
| akiec | Actinic keratoses / Bowen's disease | Benign |
|
||||
|
||||
## 0. Requirements
|
||||
"""
|
||||
|
||||
# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
|
||||
|
||||
"""## 1. Download dataset"""
|
||||
|
||||
# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
|
||||
|
||||
import kagglehub
|
||||
|
||||
# Fetch the latest published version of the HAM10000 dataset and keep the
# location reported by kagglehub in `path`.
path = kagglehub.dataset_download(
    "kmader/skin-cancer-mnist-ham10000"
)
print("Path to dataset files:", path)
|
||||
|
||||
# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
|
||||
|
||||
"""## 2. Imports and setup"""
|
||||
|
||||
import os
|
||||
import glob
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
from PIL import Image
|
||||
|
||||
"""### Loading dataset"""
|
||||
|
||||
# Use the location returned by kagglehub.dataset_download() instead of a
# hard-coded "/kaggle/input/..." directory, so the script also works when it
# is run outside a Kaggle notebook (e.g. from run.sh on a local machine).
dataset_path = path

# Metadata file: one row per image, including `image_id` and the diagnosis.
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
df = pd.read_csv(metadata_path)

# Show first rows
print(df.head())

# Collect every JPEG below the dataset root, whatever sub-folder it sits in.
image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)

# Lookup table: image_id (file name without extension) -> full image path.
# If the same file name occurs in several folders, the last one found wins.
imageid_path_dict = {
    os.path.splitext(os.path.basename(image_path))[0]: image_path
    for image_path in image_paths
}

# Add image path column; ids without a matching file end up as None.
df['path'] = df['image_id'].map(imageid_path_dict.get)

# Check
print(df[['image_id', 'path']].head())
|
||||
|
||||
"""## 3. Dataset analysis
|
||||
|
||||
### Missingness
|
||||
"""
|
||||
|
||||
# Count metadata rows for which no image file was found on disk.
missing = df['path'].isnull().sum()
print(f"Missing images: {missing}")

"""### Class distribution"""

# Bars are ordered from the most to the least frequent diagnosis code.
dx_order = df['dx'].value_counts().index

plt.figure(figsize=(10,5))
sns.countplot(data=df, x='dx', order=dx_order)
plt.title("Class Distribution")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.show()
|
||||
|
||||
"""### Visualize samples"""
|
||||
|
||||
# 2 x 4 grid of randomly drawn lesions, each titled with its diagnosis code.
fig, axes = plt.subplots(2, 4, figsize=(10,5))

for ax in axes.flat:
    # One independent random draw per panel.
    sample = df.sample(1).iloc[0]
    img = Image.open(sample['path'])

    ax.imshow(img)
    ax.set_title(sample['dx'])
    ax.axis('off')

plt.tight_layout()
plt.show()
|
||||
|
||||
"""### Data distribution"""
|
||||
|
||||
# Age histogram (rows without an age are left out).
known_ages = df['age'].dropna()
plt.figure(figsize=(8,5))
sns.histplot(known_ages, bins=20)
plt.title("Age Distribution")
plt.show()

# Sex counts, drawn on matplotlib's default-sized figure.
sns.countplot(data=df, x='sex')
plt.title("Sex Distribution")
plt.show()

# Body-site counts, most frequent site first; labels are rotated to stay legible.
localization_order = df['localization'].value_counts().index
plt.figure(figsize=(12,5))
sns.countplot(data=df, x='localization', order=localization_order)
plt.xticks(rotation=45)
plt.title("Lesion Localization")
plt.show()
|
||||
|
||||
"""### Image sizes"""
|
||||
|
||||
# Inspect the pixel dimensions of a random subset of images.
# - Rows without an image file (path is None) are skipped.
# - The sample is capped at the number of available images so small datasets
#   do not make `sample` raise.
# - The loop variable is deliberately not called `path`, which would overwrite
#   the dataset location stored under that name earlier in the script.
available_paths = df['path'].dropna()
sample_paths = available_paths.sample(min(100, len(available_paths)))

sizes = []
for image_path in sample_paths:
    # The context manager closes the underlying file as soon as the size is read.
    with Image.open(image_path) as img:
        sizes.append(img.size)

print(pd.Series(sizes).value_counts())
|
||||
|
||||
"""## 2. Prepare dataset
|
||||
|
||||
### Encoding Binary Labels
|
||||
"""
|
||||
|
||||
# Mapping from dx to benign/malignant
# 0 --> benign
# 1 --> malignant
benign_malignant_dict = {
    'nv': 0,
    'bcc': 1,
    'bkl': 0,
    'df': 0,
    'vasc': 0,
    'mel': 1,
    'akiec': 0,
}

# Create new column
df['target'] = df['dx'].map(benign_malignant_dict)

# Preview
print(df[['dx', 'target']].head())

plt.figure(figsize=(6,3))
sns.countplot(data=df, x='target')
plt.title("Benign vs Malignant Distribution")
plt.xlabel("Lesion Type")
plt.ylabel("Count")
plt.show()

print(df['target'].value_counts())

"""### Train / Validation split"""

from sklearn.model_selection import train_test_split

# Stratify on the binary target so both splits keep the same class ratio.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))

# Convert 'target' column to string type for ImageDataGenerator.
# This has to happen after the split: train_df / val_df do not exist before
# it, so converting earlier raises NameError when the file runs top to bottom.
# `assign` returns new frames, so `df` itself keeps its numeric target column.
train_df = train_df.assign(target=train_df['target'].astype(str))
val_df = val_df.assign(target=val_df['target'].astype(str))

print("Train target dtype after conversion:", train_df['target'].dtype)
print("Validation target dtype after conversion:", val_df['target'].dtype)
|
||||
|
||||
"""### Class weight (class imbalance)"""
|
||||
|
||||
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights computed from the training labels only.
train_targets = train_df['target']
classes = np.unique(train_targets)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_targets)

# Key each weight by its position in the sorted `classes` array.
class_weights = {position: weight for position, weight in enumerate(weights)}
print(class_weights)
|
||||
|
||||
"""## 3. Build the model"""
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras import layers, models
|
||||
from tensorflow.keras.applications import DenseNet121
|
||||
from tensorflow.keras.applications.densenet import preprocess_input
|
||||
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
|
||||
from tensorflow.keras.models import Model
|
||||
|
||||
# NOTE: this augmentation pipeline is defined but is NOT part of the model
# graph. The original code applied it to a separate Input tensor and then
# immediately overwrote the result with `base_model.output`, so it never had
# any effect; that dead wiring has been removed. Training-time augmentation is
# performed by the ImageDataGenerator in the data-generator section instead.
# NOTE(review): before wiring these layers into the model, confirm they accept
# inputs that have already been through `preprocess_input`.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
])

# ImageNet-pretrained DenseNet121 backbone without its classification head.
base_model = DenseNet121(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Classification head: pooled backbone features -> two dense layers -> one
# sigmoid unit for the binary benign/malignant output.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dense(256, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=predictions)
|
||||
|
||||
"""## 4. Data Generators
|
||||
|
||||
I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
|
||||
"""
|
||||
|
||||
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
||||
|
||||
# Image dimensions
IMG_WIDTH = 224
IMG_HEIGHT = 224

# Training generator: DenseNet preprocessing plus random geometric augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Validation generator: preprocessing only, no augmentation.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Flow from dataframe. `target_size` is (height, width) in Keras; both are 224
# here, so passing them in the documented order does not change the result.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    seed=42,
)

# shuffle=False keeps validation batches in dataframe order. The evaluation
# section compares model predictions with `val_generator.classes`, which is
# only valid when the generator yields samples in that same order; with the
# default shuffling the classification report and confusion matrix would be
# computed on misaligned labels.
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    shuffle=False,
    seed=42,
)
|
||||
|
||||
"""## 5. Compile the Model
|
||||
|
||||
I will compile the model using the Adam optimizer, binary cross-entropy loss (suitable for binary classification), and track accuracy as a metric.
|
||||
"""
|
||||
|
||||
from tensorflow.keras.optimizers import Adam
|
||||
|
||||
# Adam with a small learning rate, binary cross-entropy loss, accuracy metric.
optimizer = Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
|
||||
|
||||
"""## 6. Train the Model
|
||||
|
||||
I will now train the model using the prepared data generators. I'll also add callbacks for early stopping to prevent overfitting and to save the best model.
|
||||
"""
|
||||
|
||||
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
|
||||
|
||||
# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Write the model to disk whenever validation accuracy reaches a new maximum.
model_checkpoint = ModelCheckpoint(
    'best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)

training_callbacks = [early_stopping, model_checkpoint]

# Train the model; class weights compensate for the benign/malignant imbalance.
history = model.fit(
    train_generator,
    epochs=3,  # You can adjust the number of epochs
    validation_data=val_generator,
    callbacks=training_callbacks,
    class_weight=class_weights,
)
|
||||
|
||||
"""## X. Evaluation
|
||||
|
||||
### Load best model
|
||||
"""
|
||||
|
||||
from tensorflow.keras.models import load_model
|
||||
|
||||
# Reload the checkpoint written by ModelCheckpoint during training.
best_model = load_model('best_model.keras')

"""### Evaluate on validation set"""

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
evaluation = best_model.evaluate(val_generator)
loss, accuracy = evaluation
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
|
||||
|
||||
"""### Predictions and Classification Report"""
|
||||
|
||||
from sklearn.metrics import classification_report, confusion_matrix
|
||||
|
||||
# Collect labels and predictions from the very same batches. Reading labels
# from `val_generator.classes` is only correct when the generator does not
# shuffle; pairing them batch by batch keeps y_true and y_pred aligned
# regardless of the generator's shuffle setting.
val_generator.reset()

batch_labels = []
batch_probs = []
for batch_index in range(len(val_generator)):
    batch_images, labels = val_generator[batch_index]
    batch_labels.append(labels)
    batch_probs.append(best_model.predict_on_batch(batch_images))

# Sigmoid outputs, one row per validation image.
y_pred_probs = np.concatenate(batch_probs)
# Threshold at 0.5 and flatten to 1-D so the shape matches y_true.
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

y_true = np.concatenate(batch_labels).astype(int)

print("Classification Report:")
print(classification_report(y_true, y_pred))
|
||||
|
||||
"""### Confusion Matrix"""
|
||||
|
||||
# Rows are true labels, columns are predicted labels (0 = benign, 1 = malignant).
cm = confusion_matrix(y_true, y_pred)
class_names = ['Benign', 'Malignant']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
|
||||
|
||||
"""### Training History Plots"""
|
||||
|
||||
plt.figure(figsize=(12, 5))

# One panel per metric: (subplot position, training key, validation key,
# panel title, y-axis label).
panels = (
    (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
)

for position, train_key, val_key, panel_title, y_label in panels:
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()
|
||||
Reference in New Issue
Block a user