Test

Final final
We are done
2026-06-03 18:08:13 +02:00 · 2026-05-26 16:10:39 +02:00 · 2026-05-26 16:00:50 +02:00 · 2026-05-26 15:09:15 +02:00 · 2026-05-26 15:07:02 +02:00 · 2026-05-26 14:31:57 +02:00
3 changed files with 1909 additions and 414 deletions
--- a/Skin_Cancer_Classification.ipynb
+++ b/Skin_Cancer_Classification.ipynb
--- a/run.sh
+++ b/run.sh
@@ -16,8 +16,10 @@ source "$VENV_DIR/bin/activate"
 # 3. Install dependencies (lightweight, safe to re-run)
 echo "Installing dependencies..."
 pip install --upgrade pip
-pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
+pip install pandas numpy matplotlib seaborn pillow scikit-learn
 pip install "tensorflow[and-cuda]"
 pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
-python3 skin_cancer_classification.py
+jupyter nbconvert --to notebook --execute Skin_Cancer_Classification.ipynb --output Skin_Cancer_Classification_Final.ipynb --log-level=DEBUG
 # python3 skin_cancer_classification.py
 echo "Done."
--- a/skin_cancer_classification.py
+++ b/skin_cancer_classification.py
@@ -1,412 +0,0 @@
 # -*- coding: utf-8 -*-
 """Skin Cancer Classification.ipynb
 Automatically generated by Colab.
 Original file is located at
    https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
 # Skin Cancer Classification
 **Output**: Benign / Malignant
 **Model**: DenseNet121 (transfer learning)
 **Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
 | Class | Name                                | Value     |
 |-------|-------------------------------------|-----------|
 | nv    | Melanocytic nevi (moles)            | Benign    |
 | bcc   | Basal cell carcinoma                | Malignant |
 | bkl   | Benign keratosis-like lesions       | Benign    |
 | df    | Dermatofibroma                      | Benign    |
 | vasc  | Vascular lesions                    | Benign    |
 | mel   | Melanoma                            | Malignant |
 | akiec | Actinic keratoses / Bowen's disease | Benign    |
 ## 1. Requirements and dataset download
 """
 import sys
 # True when the google.colab module is already loaded, i.e. the script runs inside Colab
 IN_COLAB = 'google.colab' in sys.modules
 # Colab-only dependency installation, kept for reference (notebook shell syntax):
 # if IN_COLAB:
 #  !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
 #  !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
 import kagglehub
 # Download latest version of the HAM10000 dataset; the returned value is used
 # further down as the dataset root folder
 path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
 print("Path to dataset files:", path)
 # Notebook-only alternative that copies the Kaggle input folder into /content:
 # !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
 """## 2. Imports and setup"""
 import os
 import glob
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 from PIL import Image
 # Source - https://stackoverflow.com/a/53586419
 # Posted by korakot, modified by community. See post 'Timeline' for change history
 # Retrieved 2026-05-21, License - CC BY-SA 4.0
 """### Loading dataset"""
 # Dataset root folder (the value returned by the download step)
 dataset_path = path
 # Load the metadata CSV and preview its first rows
 metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
 df = pd.read_csv(metadata_path)
 print(df.head())
 # Recursively collect every JPEG below the dataset root
 image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)
 # Lookup table: image_id (file name without extension) -> full image path
 image_ids = [os.path.splitext(os.path.basename(p))[0] for p in image_paths]
 imageid_path_dict = dict(zip(image_ids, image_paths))
 # Attach the resolved path to every metadata row (missing when no file matches)
 df['path'] = df['image_id'].map(imageid_path_dict.get)
 # Sanity check
 print(df[['image_id', 'path']].head())
 """## 3. Dataset analysis
 ### Missingness
 """
 # Number of metadata rows for which no image file was found
 missing = df['path'].isna().sum()
 print(f"Missing images: {missing}")
 """### Class distribution"""
 # Bars are ordered from the most to the least frequent diagnosis
 dx_order = df['dx'].value_counts().index
 plt.figure(figsize=(10, 5))
 sns.countplot(data=df, x='dx', order=dx_order)
 plt.title("Class Distribution")
 plt.xlabel("Diagnosis")
 plt.ylabel("Count")
 plt.show()
 """### Visualize samples"""
 # Only rows whose image file was found on disk can be displayed
 displayable = df.dropna(subset=['path'])
 fig, axes = plt.subplots(2, 4, figsize=(10,5))
 for ax in axes.flat:
    # One random lesion per subplot, titled with its diagnosis code
    sample = displayable.sample(1).iloc[0]
    # The context manager closes the file handle once the pixels are drawn
    with Image.open(sample['path']) as img:
       ax.imshow(img)
    ax.set_title(sample['dx'])
    ax.axis('off')
 plt.tight_layout()
 plt.show()
 """### Data distribution"""
 # Age histogram (rows without an age are ignored)
 plt.figure(figsize=(8,5))
 sns.histplot(df['age'].dropna(), bins=20)
 plt.title("Age Distribution")
 plt.show()
 # Open a fresh figure: with a backend where plt.show() does not close the
 # current figure, the count plot would otherwise be drawn over the histogram.
 plt.figure()
 sns.countplot(data=df, x='sex')
 plt.title("Sex Distribution")
 plt.show()
 # Body locations, most frequent first; labels rotated so they stay readable
 plt.figure(figsize=(12,5))
 sns.countplot(
    data=df,
    x='localization',
    order=df['localization'].value_counts().index
 )
 plt.xticks(rotation=45)
 plt.title("Lesion Localization")
 plt.show()
 """### Image sizes"""
 # Inspect the pixel dimensions of up to 100 randomly chosen images.
 # Rows without a file are skipped, and the sample size is capped so small
 # datasets do not make sample() raise.
 available_paths = df['path'].dropna()
 sizes = []
 # A dedicated loop name keeps the module-level `path` (dataset folder) intact
 for image_path in available_paths.sample(min(100, len(available_paths))):
    # Close each file right after reading its size
    with Image.open(image_path) as img:
       sizes.append(img.size)
 print(pd.Series(sizes).value_counts())
 """## 2. Prepare dataset
 ### Encoding Binary Labels
 """
 # Mapping from dx to benign/malignant
 # 0 --> benign
 # 1 --> malignant
 benign_malignant_dict = {
    'nv': 0,
    'bcc': 1,
    'bkl': 0,
    'df': 0,
    'vasc': 0,
    'mel': 1,
    'akiec': 0
 }
 # Create new column
 df['target'] = df['dx'].map(benign_malignant_dict)
 # A diagnosis code missing from the mapping becomes NaN, which would turn the
 # labels into floats and corrupt the stratified split; fail early instead.
 unmapped_dx = df.loc[df['target'].isna(), 'dx'].unique()
 if len(unmapped_dx) > 0:
    raise ValueError(f"Unmapped diagnosis codes in 'dx': {sorted(map(str, unmapped_dx))}")
 # Preview
 print(df[['dx', 'target']].head())
 from sklearn.model_selection import train_test_split
 # Stratified 80/20 split so both parts keep the benign/malignant ratio
 train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42
 )
 print("Train size:", len(train_df))
 print("Validation size:", len(val_df))
 # Convert 'target' column to string type for ImageDataGenerator.
 # assign() returns new frames, so nothing is written into a slice of `df`
 # (avoids pandas' chained-assignment warning).
 train_df = train_df.assign(target=train_df['target'].astype(str))
 val_df = val_df.assign(target=val_df['target'].astype(str))
 print("Train target dtype after conversion:", train_df['target'].dtype)
 print("Validation target dtype after conversion:", val_df['target'].dtype)
 # Overall label balance of the full dataset
 plt.figure(figsize=(6,3))
 sns.countplot(data=df, x='target')
 plt.title("Benign vs Malignant Distribution")
 plt.xlabel("Lesion Type")
 plt.ylabel("Count")
 plt.show()
 print(df['target'].value_counts())
 """### Train / Validation split
 ### Class weight (class imbalance)
 """
 from sklearn.utils.class_weight import compute_class_weight
 # np.unique returns the distinct labels in sorted order, so weights[i] belongs to classes[i]
 classes = np.unique(train_df['target'])
 weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df['target'])
 # Key each weight by the position of its class in `classes`
 class_weights = {position: weight for position, weight in enumerate(weights)}
 print(class_weights)
 """## 3. Build the model"""
 import tensorflow as tf
 from tensorflow.keras import layers, models
 from tensorflow.keras.applications import DenseNet121
 from tensorflow.keras.applications.densenet import preprocess_input
 from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
 from tensorflow.keras.models import Model
 from tensorflow.keras.optimizers import Adam
 gpus = tf.config.list_physical_devices('GPU')
 print("GPUs:", gpus)
 strategy = tf.distribute.MirroredStrategy()
 # NOTE: this in-model augmentation pipeline is defined but deliberately NOT
 # connected to the network. The original code fed `inputs` through it and then
 # immediately overwrote the result with `base_model.output`, so it never had
 # any effect; augmentation is performed by the training ImageDataGenerator.
 data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
 ])
 # Everything that creates variables (backbone, head, optimizer) is built
 # inside the strategy scope so the variables are mirrored across all GPUs.
 with strategy.scope(): # Use all gpus
    # ImageNet-pretrained DenseNet121 backbone without its classification top
    base_model = DenseNet121(
       weights='imagenet',
       include_top=False,
       input_shape=(224, 224, 3)
    )
    inputs = base_model.input
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(256, activation='relu')(x)
    # Single sigmoid unit: output is the probability of the positive (malignant) class
    predictions = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
 """## 4. Data Generators
 I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
 """
 from tensorflow.keras.preprocessing.image import ImageDataGenerator
 # Image dimensions
 IMG_WIDTH = 224
 IMG_HEIGHT = 224
 # Training generator: DenseNet preprocessing plus random geometric augmentation
 train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
 )
 # Validation generator: preprocessing only, no augmentation
 val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
 # Flow from dataframe (Keras expects target_size as (height, width))
 train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    seed=42
 )
 # shuffle=False keeps the batches in dataframe order, so predictions made on
 # this generator line up with `val_generator.classes`; the default
 # (shuffle=True) silently misaligns them and invalidates the metrics.
 val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='path',
    y_col='target',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=32,
    class_mode='binary',
    shuffle=False,
    seed=42
 )
 """## 6. Train the Model"""
 from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
 # Early stopping watches val_loss with a patience of 10 epochs and restores the best weights
 early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
 # The checkpoint keeps only the epoch with the highest val_accuracy in best_model.keras
 model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', mode='max', save_best_only=True)
 training_callbacks = [early_stopping, model_checkpoint]
 # Fit the network; class weights compensate for the benign/malignant imbalance
 history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=3, # Number of epochs, adjust as needed
    class_weight=class_weights,
    callbacks=training_callbacks
 )
 """## 7. Evaluation
 ### Load best model
 """
 from tensorflow.keras.models import load_model
 # Reload the checkpoint written during training
 best_model = load_model('best_model.keras')
 """### Evaluate on validation set"""
 loss, accuracy = best_model.evaluate(val_generator)
 print(f"Validation Loss: {loss:.4f}")
 print(f"Validation Accuracy: {accuracy:.4f}")
 """### Predictions and Classification Report"""
 from sklearn.metrics import classification_report, confusion_matrix
 # Collect the labels together with the predictions, batch by batch. Each
 # batch yields its own labels, so y_true and y_pred stay aligned even when
 # the generator shuffles (comparing against `val_generator.classes`, which
 # is in dataframe order, is only valid for an unshuffled generator).
 batch_probs = []
 batch_labels = []
 for batch_index in range(len(val_generator)):
    images, labels = val_generator[batch_index]
    batch_probs.append(best_model.predict_on_batch(images))
    batch_labels.append(labels)
 y_pred_probs = np.concatenate(batch_probs)
 # Threshold the sigmoid output at 0.5 and flatten (N, 1) -> (N,)
 y_pred = (y_pred_probs > 0.5).astype(int).ravel()
 y_true = np.concatenate(batch_labels).astype(int)
 print("Classification Report:")
 print(classification_report(y_true, y_pred))
 """### Confusion Matrix"""
 cm = confusion_matrix(y_true, y_pred)
 plt.figure(figsize=(8, 6))
 sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
 plt.title('Confusion Matrix')
 plt.xlabel('Predicted Label')
 plt.ylabel('True Label')
 plt.show()
 """### Training History Plots"""
 # One panel per metric, each showing the training and validation curves per epoch
 history_panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
 ]
 plt.figure(figsize=(12, 5))
 for panel_number, (train_key, val_key, panel_title, y_label) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
 plt.tight_layout()
 plt.show()
Author	SHA1	Message	Date
BinarySandia04	9478b2427e	Test	2026-06-03 18:08:13 +02:00
BinarySandia04	d616ec2168	Final final	2026-05-26 16:10:39 +02:00
BinarySandia04	637aa2e380	We are done	2026-05-26 16:00:50 +02:00
BinarySandia04	dd3bd507b1	Debug	2026-05-26 15:09:15 +02:00
BinarySandia04	b3d9c2e76e	Si	2026-05-26 15:07:02 +02:00
BinarySandia04	f62842d23e	Whatever	2026-05-26 14:31:57 +02:00
BinarySandia04	1a7deafab7	Test notebook run	2026-05-26 14:14:30 +02:00
BinarySandia04	b930018981	Final2	2026-05-25 20:29:44 +02:00
BinarySandia04	9dc29f19e4	Final?	2026-05-25 20:25:44 +02:00
BinarySandia04	b2c0110d11	ok no	2026-05-21 17:53:27 +02:00
BinarySandia04	4b56e164a1	Parellization ok!	2026-05-21 17:45:28 +02:00
BinarySandia04	2174ab1fb0	Ok now we want to get better accuracy	2026-05-21 17:33:34 +02:00
BinarySandia04	ba3d03b186	now yes	2026-05-21 15:42:07 +02:00
BinarySandia04	e1a8d38cee	typo	2026-05-21 15:10:30 +02:00
BinarySandia04	3162934b99	ok	2026-05-21 15:05:51 +02:00
BinarySandia04	6a25385409	Test	2026-05-21 14:59:29 +02:00