First test

2026-05-21 13:45:42 +02:00
parent c7d4dbdea0
commit d7b39f6c9f
2 changed files with 431 additions and 3 deletions
--- a/run.sh
+++ b/run.sh
@@ -1,5 +1,23 @@
 #!/bin/bash

-echo "Hello!"
-sleep 15
-echo "Goodbye!"
+set -e
+
+VENV_DIR=".venv"
+
+# 1. Create the virtual environment on first run only.
+if [[ ! -d "$VENV_DIR" ]]; then
+  echo "Creating virtual environment..."
+  python3 -m venv "$VENV_DIR"
+fi
+
+# 2. Activate it so `python` and `pip` below resolve to the venv.
+source "$VENV_DIR/bin/activate"
+
+# 3. Install dependencies (safe to re-run); extras are quoted to avoid globbing.
+echo "Installing dependencies..."
+python -m pip install --upgrade pip
+python -m pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
+python -m pip install --upgrade "kagglehub[pandas-datasets,hf-datasets]"
+
+python3 skin_cancer_classification.py
+echo "Done."
--- a/skin_cancer_classification.py
+++ b/skin_cancer_classification.py
@@ -0,0 +1,410 @@
+# -*- coding: utf-8 -*-
+"""Skin Cancer Classification.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
+
+# Skin Cancer Classification
+**Output**: Benign / Malignant
+
+**Model**: DenseNet121 (transfer learning)
+
+**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
+
+| Class | Name                                | Value     |
+|-------|-------------------------------------|-----------|
+| nv    | Melanocytic nevi (moles)            | Benign    |
+| bcc   | Basal cell carcinoma                | Malignant |
+| bkl   | Benign keratosis-like lesions       | Benign    |
+| df    | Dermatofibroma                      | Benign    |
+| vasc  | Vascular lesions                    | Benign    |
+| mel   | Melanoma                            | Malignant |
+| akiec | Actinic keratoses / Bowen's disease | Benign    |
+
+## 0. Requirements
+"""
+
+# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
+
+"""## 1. Download dataset"""
+
+# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
+
+import kagglehub
+
+# Download the latest dataset version; `path` holds whatever kagglehub returns.
+path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
+
+print("Path to dataset files:", path)
+# Notebook-only shell command, left disabled in the script:
+# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
+
+"""## 2. Imports and setup"""
+
+import os
+import glob
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from PIL import Image
+
+"""### Loading dataset"""
+
+# Use the directory returned by kagglehub rather than a hard-coded
+# /kaggle/input location, which is not guaranteed to exist outside hosted notebooks.
+dataset_path = path
+
+# Metadata file
+metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
+
+# Load CSV
+df = pd.read_csv(metadata_path)
+
+# Show first rows
+print(df.head())
+
+# Collect every .jpg below the dataset directory (recursive search).
+image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)
+
+# Create dictionary:
+# key = image_id (file name without extension)
+# value = full image path
+imageid_path_dict = {
+    os.path.splitext(os.path.basename(x))[0]: x
+    for x in image_paths
+}
+
+# Add image path column; ids with no matching file end up null.
+df['path'] = df['image_id'].map(imageid_path_dict.get)
+
+print(df[['image_id', 'path']].head())
+
+"""## 3. Dataset analysis
+
+### Missingness
+"""
+
+# Count metadata rows whose image file was not found on disk.
+missing = df['path'].isna().sum()
+print(f"Missing images: {missing}")
+
+"""### Class distribution"""
+
+# Bar chart of diagnoses, most frequent class first.
+dx_order = df['dx'].value_counts().index
+plt.figure(figsize=(10, 5))
+sns.countplot(data=df, x='dx', order=dx_order)
+plt.title("Class Distribution")
+plt.xlabel("Diagnosis")
+plt.ylabel("Count")
+plt.show()
+
+"""### Visualize samples"""
+
+fig, axes = plt.subplots(2, 4, figsize=(10, 5))
+
+# Draw one random example per subplot, skipping rows without an image file.
+available = df.dropna(subset=['path'])
+for ax in axes.flat:
+    sample = available.sample(1).iloc[0]
+    # The context manager closes the file handle once the image is drawn.
+    with Image.open(sample['path']) as img:
+        ax.imshow(img)
+    ax.set_title(sample['dx'])
+    ax.axis('off')
+plt.tight_layout()
+plt.show()
+
+"""### Data distribution"""
+
+# Age histogram; rows with an unknown age are left out.
+known_ages = df['age'].dropna()
+
+plt.figure(figsize=(8, 5))
+sns.histplot(known_ages, bins=20)
+plt.title("Age Distribution")
+plt.show()
+
+# Sex counts. No figure is created explicitly for this plot, so it is drawn
+# on whichever figure pyplot treats as current.
+sns.countplot(data=df, x='sex')
+plt.title("Sex Distribution")
+plt.show()
+
+# Body-site counts, most frequent site first.
+site_order = df['localization'].value_counts().index
+plt.figure(figsize=(12, 5))
+sns.countplot(data=df, x='localization', order=site_order)
+
+# Rotate the x tick labels by 45 degrees.
+plt.xticks(rotation=45)
+plt.title("Lesion Localization")
+plt.show()
+
+"""### Image sizes"""
+
+# Up to 100 random images; the loop variable must not clobber the dataset `path`.
+sample_paths = df['path'].dropna()
+sizes = []
+for image_path in sample_paths.sample(min(100, len(sample_paths))):
+    with Image.open(image_path) as img:  # close each file handle promptly
+        sizes.append(img.size)
+print(pd.Series(sizes).value_counts())
+
+"""## 2. Prepare dataset
+
+### Encoding Binary Labels
+"""
+
+# Mapping from dx to benign/malignant
+# 0 --> benign
+# 1 --> malignant
+# NOTE(review): 'akiec' is mapped to benign here - confirm this grouping is
+# clinically intended before relying on the labels.
+benign_malignant_dict = {
+    'nv': 0,
+    'bcc': 1,
+    'bkl': 0,
+    'df': 0,
+    'vasc': 0,
+    'mel': 1,
+    'akiec': 0
+}
+
+# Create new column
+df['target'] = df['dx'].map(benign_malignant_dict)
+
+# Preview
+print(df[['dx', 'target']].head())
+
+plt.figure(figsize=(6,3))
+sns.countplot(data=df, x='target')
+plt.title("Benign vs Malignant Distribution")
+plt.xlabel("Lesion Type")
+plt.ylabel("Count")
+plt.show()
+
+print(df['target'].value_counts())
+
+"""### Train / Validation split"""
+
+from sklearn.model_selection import train_test_split
+
+train_df, val_df = train_test_split(
+    df,
+    test_size=0.2,
+    stratify=df['target'],
+    random_state=42
+)
+
+# Convert 'target' to strings for ImageDataGenerator. This has to happen after
+# the split: train_df / val_df do not exist before it. `assign` builds new
+# frames instead of writing into slices of df.
+train_df = train_df.assign(target=train_df['target'].astype(str))
+val_df = val_df.assign(target=val_df['target'].astype(str))
+print("Train target dtype after conversion:", train_df['target'].dtype)
+print("Validation target dtype after conversion:", val_df['target'].dtype)
+print("Train size:", len(train_df))
+print("Validation size:", len(val_df))
+
+"""### Class weight (class imbalance)"""
+
+from sklearn.utils.class_weight import compute_class_weight
+
+# Distinct labels present in the training split, in sorted order.
+train_targets = train_df['target']
+classes = np.unique(train_targets)
+
+# 'balanced' gives the rarer class the larger weight.
+weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_targets)
+
+# Key each weight by its position in `classes`: {0: w0, 1: w1, ...}.
+class_weights = {index: weight for index, weight in enumerate(weights)}
+
+print(class_weights)
+
+"""## 3. Build the model"""
+
+import tensorflow as tf
+from tensorflow.keras import layers, models
+from tensorflow.keras.applications import DenseNet121
+from tensorflow.keras.applications.densenet import preprocess_input
+from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
+from tensorflow.keras.models import Model
+
+# NOTE: defined but NOT wired into `model`; training-time augmentation comes
+# from the ImageDataGenerator configured further below.
+data_augmentation = tf.keras.Sequential([
+    tf.keras.layers.RandomFlip("horizontal"),
+    tf.keras.layers.RandomRotation(0.1),
+    tf.keras.layers.RandomZoom(0.1),
+    tf.keras.layers.RandomContrast(0.1),
+])
+
+base_model = DenseNet121(
+    weights='imagenet',
+    include_top=False,
+    input_shape=(224, 224, 3)
+)
+
+# Unused placeholder kept so the name stays defined. The old `x = data_augmentation(inputs)`
+# was overwritten on the next line and never reached the graph, so it was dropped.
+inputs = tf.keras.Input(shape=(224,224,3))
+# Classification head on the DenseNet121 features; the graph starts at base_model.input.
+x = GlobalAveragePooling2D()(base_model.output)
+x = Dense(512, activation='relu')(x)
+x = Dense(256, activation='relu')(x)
+predictions = Dense(1, activation='sigmoid')(x)  # single sigmoid unit: binary output
+model = Model(inputs=base_model.input, outputs=predictions)
+
+"""## 4. Data Generators
+
+I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
+"""
+
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+# Image dimensions
+IMG_WIDTH = 224
+IMG_HEIGHT = 224
+
+# Training images are preprocessed for DenseNet and randomly augmented.
+train_datagen = ImageDataGenerator(
+    preprocessing_function=preprocess_input,
+    rotation_range=20,
+    width_shift_range=0.2,
+    height_shift_range=0.2,
+    shear_range=0.2,
+    zoom_range=0.2,
+    horizontal_flip=True,
+    fill_mode='nearest'
+)
+# Validation images are only preprocessed, never augmented.
+val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
+
+# Flow from dataframe. target_size is given as (height, width).
+train_generator = train_datagen.flow_from_dataframe(
+    dataframe=train_df,
+    x_col='path',
+    y_col='target',
+    target_size=(IMG_HEIGHT, IMG_WIDTH),
+    batch_size=32,
+    class_mode='binary',
+    seed=42
+)
+# shuffle=False: keep dataframe order so predictions align with val_generator.classes.
+val_generator = val_datagen.flow_from_dataframe(
+    dataframe=val_df,
+    x_col='path',
+    y_col='target',
+    target_size=(IMG_HEIGHT, IMG_WIDTH),
+    batch_size=32,
+    class_mode='binary',
+    shuffle=False
+)
+
+"""## 5. Compile the Model
+
+I will compile the model using the Adam optimizer, binary cross-entropy loss (suitable for binary classification), and track accuracy as a metric.
+"""
+
+from tensorflow.keras.optimizers import Adam
+
+# Adam (learning rate 0.0001), binary cross-entropy loss, accuracy metric.
+model.compile(
+    optimizer=Adam(learning_rate=0.0001),
+    loss='binary_crossentropy',
+    metrics=['accuracy'],
+)
+
+"""## 6. Train the Model
+
+I will now train the model using the prepared data generators. I'll also add callbacks for early stopping to prevent overfitting and to save the best model.
+"""
+
+from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+
+# Watches val_loss with a patience of 10 epochs; restore_best_weights is enabled.
+early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
+
+# Writes 'best_model.keras' only when val_accuracy reaches a new maximum.
+model_checkpoint = ModelCheckpoint(
+    'best_model.keras', monitor='val_accuracy', save_best_only=True, mode='max'
+)
+
+# NOTE(review): with epochs=3 the patience of 10 can never trigger early stopping.
+training_callbacks = [early_stopping, model_checkpoint]
+history = model.fit(
+    train_generator,
+    epochs=3,
+    validation_data=val_generator,
+    callbacks=training_callbacks,
+    class_weight=class_weights,  # counteracts the class imbalance
+)
+
+"""## X. Evaluation
+
+### Load best model
+"""
+
+from tensorflow.keras.models import load_model
+best_model = load_model('best_model.keras')
+
+"""### Evaluate on validation set"""
+loss, accuracy = best_model.evaluate(val_generator)
+print(f"Validation Loss: {loss:.4f}")
+print(f"Validation Accuracy: {accuracy:.4f}")
+
+"""### Predictions and Classification Report"""
+from sklearn.metrics import classification_report, confusion_matrix
+# Collect labels and predictions batch by batch so every prediction is paired
+# with the label of the same image, whatever order the generator yields them in.
+val_generator.reset()
+batch_labels, batch_probs = [], []
+for batch_index in range(len(val_generator)):
+    batch_images, labels = val_generator[batch_index]
+    batch_labels.append(labels)
+    batch_probs.append(best_model.predict_on_batch(batch_images))
+y_pred_probs = np.concatenate(batch_probs)
+y_pred = (y_pred_probs > 0.5).astype(int).ravel()  # 0.5 decision threshold, 1-D
+y_true = np.concatenate(batch_labels).astype(int)
+print("Classification Report:")
+print(classification_report(y_true, y_pred))
+
+"""### Confusion Matrix"""
+cm = confusion_matrix(y_true, y_pred)
+plt.figure(figsize=(8, 6))
+sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
+plt.title('Confusion Matrix')
+plt.xlabel('Predicted Label')
+plt.ylabel('True Label')
+plt.show()
+
+"""### Training History Plots"""
+
+# One figure with two side-by-side panels: accuracy on the left, loss on the
+# right. Each panel shows the training curve and the validation curve.
+plt.figure(figsize=(12, 5))
+
+# (history key, panel title, y-axis label) for each panel, left to right.
+panels = [
+    ('accuracy', 'Model Accuracy', 'Accuracy'),
+    ('loss', 'Model Loss', 'Loss'),
+]
+
+for position, (metric, title, y_label) in enumerate(panels, start=1):
+    plt.subplot(1, 2, position)
+    # Training values first, then the matching 'val_'-prefixed series.
+    plt.plot(history.history[metric])
+    plt.plot(history.history['val_' + metric])
+    plt.title(title)
+    plt.ylabel(y_label)
+    plt.xlabel('Epoch')
+    plt.legend(['Train', 'Validation'], loc='upper left')
+
+plt.tight_layout()
+plt.show()