Compare commits

..

16 Commits

Author SHA1 Message Date
9478b2427e Test 2026-06-03 18:08:13 +02:00
d616ec2168 Final final 2026-05-26 16:10:39 +02:00
637aa2e380 We are done 2026-05-26 16:00:50 +02:00
dd3bd507b1 Debug 2026-05-26 15:09:15 +02:00
b3d9c2e76e Si 2026-05-26 15:07:02 +02:00
f62842d23e Whatever 2026-05-26 14:31:57 +02:00
1a7deafab7 Test notebook run 2026-05-26 14:14:30 +02:00
b930018981 Final2 2026-05-25 20:29:44 +02:00
9dc29f19e4 Final? 2026-05-25 20:25:44 +02:00
b2c0110d11 ok no 2026-05-21 17:53:27 +02:00
4b56e164a1 Parellization ok! 2026-05-21 17:45:28 +02:00
2174ab1fb0 Ok now we want to get better accuracy 2026-05-21 17:33:34 +02:00
ba3d03b186 now yes 2026-05-21 15:42:07 +02:00
e1a8d38cee typo 2026-05-21 15:10:30 +02:00
3162934b99 ok 2026-05-21 15:05:51 +02:00
6a25385409 Test 2026-05-21 14:59:29 +02:00
3 changed files with 1909 additions and 414 deletions

File diff suppressed because one or more lines are too long

6
run.sh
View File

@@ -16,8 +16,10 @@ source "$VENV_DIR/bin/activate"
# 3. Install dependencies (lightweight, safe to re-run)
echo "Installing dependencies..."
# Upgrade pip itself before installing anything else.
pip install --upgrade pip
# NOTE(review): this view shows a diff hunk whose +/- markers were stripped. The next
# line (ending in plain `tensorflow`) is presumably the removed version of the two
# install lines that follow it — confirm against the real run.sh.
pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
# Data handling, plotting, imaging and scikit-learn packages.
pip install pandas numpy matplotlib seaborn pillow scikit-learn
# TensorFlow with its `and-cuda` extra; quoted so the shell cannot glob the brackets.
pip install "tensorflow[and-cuda]"
# NOTE(review): the extras spec below is unquoted, unlike the line above; shells that
# treat [...] as a glob pattern may expand or reject it — consider quoting.
pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
# NOTE(review): presumably the removed side of the hunk (the same command appears
# commented out further down) — confirm.
python3 skin_cancer_classification.py
# Execute the notebook and write the executed copy to
# Skin_Cancer_Classification_Final.ipynb, with nbconvert logging at DEBUG level.
# NOTE(review): none of the pip lines visible in this hunk installs jupyter/nbconvert —
# confirm it is provided elsewhere.
jupyter nbconvert --to notebook --execute Skin_Cancer_Classification.ipynb --output Skin_Cancer_Classification_Final.ipynb --log-level=DEBUG
# python3 skin_cancer_classification.py
echo "Done."

View File

@@ -1,412 +0,0 @@
# -*- coding: utf-8 -*-
"""Skin Cancer Classification.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
# Skin Cancer Classification
**Output**: Benign / Malignant
**Model**: DenseNet121 (transfer learning)
**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
| Class | Name | Value |
|-------|-------------------------------------|-----------|
| nv | Melanocytic nevi (moles) | Benign |
| bcc | Basal cell carcinoma | Malignant |
| bkl | Benign keratosis-like lesions | Benign |
| df | Dermatofibroma | Benign |
| vasc | Vascular lesions | Benign |
| mel | Melanoma | Malignant |
| akiec | Actinic keratoses / Bowen's disease | Benign |
## 1. Requirements and dataset download
"""
import sys
# True when the `google.colab` module is already loaded in this interpreter.
# The flag is only referenced by the commented-out install commands below.
IN_COLAB = 'google.colab' in sys.modules
# Notebook-only shell commands, kept for reference (not valid in a plain .py file):
# if IN_COLAB:
# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
import kagglehub
# Download the HAM10000 dataset through kagglehub; `path` receives the value returned
# by dataset_download and is used further down as the dataset root folder.
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("Path to dataset files:", path)
# Notebook-only shell command, kept for reference:
# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
"""## 2. Imports and setup"""
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
# Source - https://stackoverflow.com/a/53586419
# Posted by korakot, modified by community. See post 'Timeline' for change history
# Retrieved 2026-05-21, License - CC BY-SA 4.0
"""### Loading dataset"""
# Root folder of the downloaded dataset
dataset_path = path

# Read the metadata table that ships with the dataset and preview it
metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
df = pd.read_csv(metadata_path)
print(df.head())

# Find every JPEG below the dataset root, at any folder depth
jpg_pattern = os.path.join(dataset_path, "**", "*.jpg")
image_paths = glob.glob(jpg_pattern, recursive=True)

# Lookup table: image id (file name without its extension) -> full file path
imageid_path_dict = {
    os.path.splitext(os.path.basename(jpg_file))[0]: jpg_file
    for jpg_file in image_paths
}

# Attach the on-disk location to each metadata row; ids with no matching
# file get a missing value because dict.get returns None for them
df['path'] = df['image_id'].map(imageid_path_dict.get)

# Quick sanity check of the new column
print(df[['image_id', 'path']].head())
"""## 3. Dataset analysis
### Missingness
"""
# Number of metadata rows for which no image file was found
missing = df['path'].isna().sum()
print(f"Missing images: {missing}")
"""### Class distribution"""
# Bar chart of diagnosis codes, most frequent class first
dx_order = df['dx'].value_counts().index
plt.figure(figsize=(10, 5))
sns.countplot(x='dx', data=df, order=dx_order)
plt.title("Class Distribution")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.show()
"""### Visualize samples"""
# 2 x 4 grid of randomly drawn lesions, each titled with its diagnosis code
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(10, 5))
for i, ax in enumerate(axes.flat):
    # One random metadata row per panel (rows may repeat across panels)
    sample = df.sample(n=1).iloc[0]
    img = Image.open(sample['path'])
    ax.imshow(img)
    ax.set_title(sample['dx'])
    ax.axis('off')
plt.tight_layout()
plt.show()
"""### Data distribution"""
# Age histogram; rows without a recorded age are dropped first
known_ages = df['age'].dropna()
plt.figure(figsize=(8, 5))
sns.histplot(known_ages, bins=20)
plt.title("Age Distribution")
plt.show()

# Number of records per value of the `sex` column
sns.countplot(x='sex', data=df)
plt.title("Sex Distribution")
plt.show()

# Lesion body sites, most frequent first, with slanted tick labels
localization_order = df['localization'].value_counts().index
plt.figure(figsize=(12, 5))
sns.countplot(x='localization', data=df, order=localization_order)
plt.xticks(rotation=45)
plt.title("Lesion Localization")
plt.show()
"""### Image sizes"""
# Survey the pixel dimensions of a random subset of the images.
sizes = []

# Rows whose image file was not found carry a missing path; skip them so
# Image.open is never handed a non-path value.
available_paths = df['path'].dropna()

# Draw at most 100 images; capping the sample size avoids the ValueError
# pandas raises when asked for more rows than exist.
sample_count = min(100, len(available_paths))

# The loop variable is deliberately NOT called `path`: that name holds the
# dataset root returned by kagglehub and must not be overwritten here.
for image_path in available_paths.sample(sample_count):
    # Only the header is needed for .size; the context manager closes the
    # file handle as soon as the size has been read.
    with Image.open(image_path) as img:
        sizes.append(img.size)

# Frequency table of the (width, height) tuples that were found
print(pd.Series(sizes).value_counts())
"""## 2. Prepare dataset
### Encoding Binary Labels
"""
# Mapping from the diagnosis code in `dx` to the binary target
# 0 --> benign
# 1 --> malignant
benign_malignant_dict = {
    'nv': 0,
    'bcc': 1,
    'bkl': 0,
    'df': 0,
    'vasc': 0,
    'mel': 1,
    'akiec': 0,
}

# Create the binary target column (a dx code absent from the mapping yields NaN)
df['target'] = df['dx'].map(benign_malignant_dict)

# Preview
print(df[['dx', 'target']].head())

from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target so both parts keep the same
# benign/malignant ratio; the fixed random_state makes the split reproducible
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42
)

# Take explicit copies before modifying the splits: assigning a column on the
# frames returned by train_test_split can otherwise trigger pandas'
# SettingWithCopyWarning, because they are derived from `df`.
train_df = train_df.copy()
val_df = val_df.copy()

print("Train size:", len(train_df))
print("Validation size:", len(val_df))

# Convert 'target' column to string type for ImageDataGenerator
# (flow_from_dataframe with class_mode='binary' expects string class labels)
train_df['target'] = train_df['target'].astype(str)
val_df['target'] = val_df['target'].astype(str)
print("Train target dtype after conversion:", train_df['target'].dtype)
print("Validation target dtype after conversion:", val_df['target'].dtype)

# Overall benign (0) vs malignant (1) balance on the full, unsplit dataframe
plt.figure(figsize=(6,3))
sns.countplot(data=df, x='target')
plt.title("Benign vs Malignant Distribution")
plt.xlabel("Lesion Type")
plt.ylabel("Count")
plt.show()
print(df['target'].value_counts())
"""### Train / Validation split
### Class weight (class imbalance)
"""
from sklearn.utils.class_weight import compute_class_weight

# Distinct training labels, sorted by np.unique
classes = np.unique(train_df['target'])

# 'balanced' weighting: rarer classes receive proportionally larger weights
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_df['target'],
)

# Key each weight by its position in `classes` (0, 1, ...), the form later
# passed to model.fit(class_weight=...)
class_weights = {
    class_index: class_weight
    for class_index, class_weight in enumerate(weights)
}
print(class_weights)
"""## 3. Build the model"""
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
# List the GPUs TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", gpus)

# Synchronous data-parallel training across the visible devices
strategy = tf.distribute.MirroredStrategy()

# In-model augmentation pipeline.
# NOTE: this pipeline is NOT part of the model graph. The original code called
# data_augmentation(inputs) and overwrote the result on the very next line, so
# these layers never influenced training. That dead wiring has been removed
# rather than silently activated, because the training ImageDataGenerator
# already applies flips, rotations and zooms.
# TODO(review): either delete this object or wire it in deliberately (and then
# drop the overlapping generator augmentation).
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
])

# Everything that creates variables (the pretrained backbone, the new head and
# the optimizer) must be built inside the strategy scope so the variables are
# created under the MirroredStrategy. Previously only Model(...) and compile()
# were inside the scope while all layers were created outside it.
with strategy.scope():  # Use all gpus
    # ImageNet-pretrained DenseNet121 without its classification head
    base_model = DenseNet121(
        weights='imagenet',
        include_top=False,
        input_shape=(224, 224, 3)
    )

    # Classification head: pooled backbone features -> two hidden layers -> sigmoid
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(256, activation='relu')(x)
    predictions = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
"""## 4. Data Generators
I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
"""
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Image dimensions
IMG_WIDTH = 224
IMG_HEIGHT = 224

# Training generator: DenseNet preprocessing plus random geometric augmentation
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation generator: preprocessing only, no augmentation
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Flow from dataframe
# target_size is (height, width) in Keras; the original passed
# (IMG_WIDTH, IMG_HEIGHT), which only worked because both values are equal.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    seed=42
)

# shuffle=False is essential here: the evaluation code compares the output of
# predict(val_generator) with val_generator.classes, which is stored in
# dataframe order. With the default shuffle=True the predictions come back in
# a random order and the classification report / confusion matrix are computed
# on misaligned pairs. Validation loss and accuracy do not depend on order.
# (No seed is passed because nothing is shuffled.)
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    shuffle=False
)
"""## 6. Train the Model"""
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once val_loss has not improved for 10 consecutive epochs and roll the
# model back to the weights of its best epoch
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

# Write the model to disk whenever val_accuracy reaches a new maximum
model_checkpoint = ModelCheckpoint(
    'best_model.keras', monitor='val_accuracy', save_best_only=True, mode='max'
)

# Fit on the training generator, validating after every epoch
training_callbacks = [early_stopping, model_checkpoint]
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=3,  # You can adjust the number of epochs
    class_weight=class_weights,  # Use class weights to handle imbalance
    callbacks=training_callbacks,
)
"""## 7. Evaluation
### Load best model
"""
from tensorflow.keras.models import load_model

# Reload the checkpoint file written during training
best_model = load_model('best_model.keras')
"""### Evaluate on validation set"""
# evaluate() output is unpacked as (loss, accuracy)
validation_metrics = best_model.evaluate(val_generator)
loss, accuracy = validation_metrics
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
"""### Predictions and Classification Report"""
from sklearn.metrics import classification_report, confusion_matrix

# Collect labels and predictions batch by batch so each prediction is paired
# with the label of the very same image. The previous approach compared
# predict(val_generator) with val_generator.classes; `classes` is stored in
# dataframe order, while a generator created with the default shuffle=True
# yields batches in a permuted order, and reset() only rewinds the batch
# counter without restoring the order, so the two arrays could be misaligned.
true_label_batches = []
probability_batches = []
for batch_index in range(len(val_generator)):
    batch_images, batch_labels = val_generator[batch_index]
    probability_batches.append(best_model.predict_on_batch(batch_images))
    true_label_batches.append(batch_labels)

# Flatten to 1-D arrays: one probability / label per validation image
y_pred_probs = np.concatenate(probability_batches).ravel()
y_true = np.concatenate(true_label_batches).astype(int).ravel()

# Probabilities above 0.5 are classified as class 1
y_pred = (y_pred_probs > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_true, y_pred))
"""### Confusion Matrix"""
# Confusion counts: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true, y_pred)

# Same tick labels on both axes, in class-index order (0, 1)
class_names = ['Benign', 'Malignant']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
"""### Training History Plots"""
plt.figure(figsize=(12, 5))

# One panel per metric: (train key, validation key, panel title, y-axis label)
history_panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
]
for panel_number, (train_key, val_key, panel_title, y_label) in enumerate(history_panels, start=1):
    # Left panel shows accuracy, right panel shows loss
    plt.subplot(1, 2, panel_number)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()