From d7b39f6c9f9327ca409f0b327ce45934275d8984 Mon Sep 17 00:00:00 2001
From: BinarySandia04
Date: Thu, 21 May 2026 13:45:42 +0200
Subject: [PATCH] First test

---
 run.sh                        |  24 +-
 skin_cancer_classification.py | 420 ++++++++++++++++++++++++++++++++++
 2 files changed, 441 insertions(+), 3 deletions(-)
 create mode 100644 skin_cancer_classification.py

diff --git a/run.sh b/run.sh
index 8d60d1a..3143f52 100644
--- a/run.sh
+++ b/run.sh
@@ -1,5 +1,23 @@
 #!/bin/bash
 
-echo "Hello!"
-sleep 15
-echo "Goodbye!"
+set -e
+
+VENV_DIR=".venv"
+
+# 1. Create venv if needed
+if [[ ! -d "$VENV_DIR" ]]; then
+    echo "Creating virtual environment..."
+    python3 -m venv "$VENV_DIR"
+fi
+
+# 2. Activate venv
+source "$VENV_DIR/bin/activate"
+
+# 3. Install dependencies (lightweight, safe to re-run)
+echo "Installing dependencies..."
+pip install --upgrade pip
+pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
+pip install --upgrade "kagglehub[pandas-datasets,hf-datasets]"
+
+python3 skin_cancer_classification.py
+echo "Done."
diff --git a/skin_cancer_classification.py b/skin_cancer_classification.py
new file mode 100644
index 0000000..7d02f0f
--- /dev/null
+++ b/skin_cancer_classification.py
@@ -0,0 +1,420 @@
+# -*- coding: utf-8 -*-
+"""Skin Cancer Classification.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1Nhk-lK1OCihOAxo36qt2e1_WgdLc-EiR
+
+# Skin Cancer Classification
+**Output**: Benign / Malignant
+
+**Model**: DenseNet121 (transfer learning)
+
+**Dataset**: Skin Cancer: HAM10000 dataset https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data
+
+| Class | Name                                | Value     |
+|-------|-------------------------------------|-----------|
+| nv    | Melanocytic nevi (moles)            | Benign    |
+| bcc   | Basal cell carcinoma                | Malignant |
+| bkl   | Benign keratosis-like lesions       | Benign    |
+| df    | Dermatofibroma                      | Benign    |
+| vasc  | Vascular lesions                    | Benign    |
+| mel   | Melanoma                            | Malignant |
+| akiec | Actinic keratoses / Bowen's disease | Benign    |
+
+## 0. Requirements
+"""
+
+# !pip install pandas numpy matplotlib seaborn pillow scikit-learn tensorflow
+
+"""## 1. Download dataset"""
+
+# !pip install --upgrade kagglehub[pandas-datasets,hf-datasets]
+
+import kagglehub
+
+# Download latest version
+path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
+
+print("Path to dataset files:", path)
+
+# !cp -R /kaggle/input/skin-cancer-mnist-ham10000 /content/skin-cancer-mnist-ham10000
+
+"""## 2. Imports and setup"""
+
+import os
+import glob
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from PIL import Image
+
+"""### Loading dataset"""
+
+# Use the directory kagglehub actually downloaded to. The hard-coded
+# "/kaggle/input/..." location only exists inside Kaggle notebooks, so the
+# script could not find the metadata when launched through run.sh.
+dataset_path = path
+
+# Metadata file
+metadata_path = os.path.join(dataset_path, "HAM10000_metadata.csv")
+
+# Load CSV
+df = pd.read_csv(metadata_path)
+
+# Show first rows
+print(df.head())
+
+# Collect all image paths
+image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)
+
+# Create dictionary:
+#   key = image_id
+#   value = full image path
+imageid_path_dict = {
+    os.path.splitext(os.path.basename(x))[0]: x
+    for x in image_paths
+}
+
+# Add image path column
+df['path'] = df['image_id'].map(imageid_path_dict.get)
+
+# Check
+print(df[['image_id', 'path']].head())
+
+"""## 3. Dataset analysis
+
+### Missingness
+"""
+
+missing = df['path'].isnull().sum()
+
+print(f"Missing images: {missing}")
+
+# Rows without an image file cannot be opened or fed to the generators,
+# so drop them before any sampling happens.
+df = df.dropna(subset=['path']).reset_index(drop=True)
+
+"""### Class distribution"""
+
+plt.figure(figsize=(10,5))
+
+sns.countplot(data=df, x='dx', order=df['dx'].value_counts().index)
+
+plt.title("Class Distribution")
+plt.xlabel("Diagnosis")
+plt.ylabel("Count")
+plt.show()
+
+"""### Visualize samples"""
+
+fig, axes = plt.subplots(2, 4, figsize=(10,5))
+
+for i, ax in enumerate(axes.flat):
+    sample = df.sample(1).iloc[0]
+
+    img = Image.open(sample['path'])
+
+    ax.imshow(img)
+    ax.set_title(sample['dx'])
+    ax.axis('off')
+
+plt.tight_layout()
+plt.show()
+
+"""### Data distribution"""
+
+plt.figure(figsize=(8,5))
+
+sns.histplot(df['age'].dropna(), bins=20)
+
+plt.title("Age Distribution")
+plt.show()
+
+sns.countplot(data=df, x='sex')
+
+plt.title("Sex Distribution")
+plt.show()
+
+plt.figure(figsize=(12,5))
+
+sns.countplot(
+    data=df,
+    x='localization',
+    order=df['localization'].value_counts().index
+)
+
+plt.xticks(rotation=45)
+plt.title("Lesion Localization")
+plt.show()
+
+"""### Image sizes"""
+
+sizes = []
+
+# Sample at most as many rows as exist, and use a loop name that does not
+# shadow the dataset `path` defined above.
+for img_path in df['path'].sample(min(100, len(df))):
+    with Image.open(img_path) as img:
+        sizes.append(img.size)
+
+print(pd.Series(sizes).value_counts())
+
+"""## 4. Prepare dataset
+
+### Encoding Binary Labels
+"""
+
+# Mapping from dx to benign/malignant
+# 0 --> benign
+# 1 --> malignant
+benign_malignant_dict = {
+    'nv': 0,
+    'bcc': 1,
+    'bkl': 0,
+    'df': 0,
+    'vasc': 0,
+    'mel': 1,
+    'akiec': 0
+}
+
+# Create new column
+df['target'] = df['dx'].map(benign_malignant_dict)
+
+# Preview
+print(df[['dx', 'target']].head())
+
+plt.figure(figsize=(6,3))
+
+sns.countplot(data=df, x='target')
+
+plt.title("Benign vs Malignant Distribution")
+plt.xlabel("Lesion Type")
+plt.ylabel("Count")
+plt.show()
+
+print(df['target'].value_counts())
+
+"""### Train / Validation split"""
+
+from sklearn.model_selection import train_test_split
+
+train_df, val_df = train_test_split(
+    df,
+    test_size=0.2,
+    stratify=df['target'],
+    random_state=42
+)
+
+print("Train size:", len(train_df))
+print("Validation size:", len(val_df))
+
+"""### Class weight (class imbalance)"""
+
+from sklearn.utils.class_weight import compute_class_weight
+
+classes = np.unique(train_df['target'])
+
+weights = compute_class_weight(
+    class_weight='balanced',
+    classes=classes,
+    y=train_df['target']
+)
+
+# np.unique sorts the labels (0, 1), so enumerate() keys are the class indices.
+class_weights = dict(enumerate(weights))
+
+print(class_weights)
+
+"""### Labels as strings"""
+
+# Convert 'target' column to string type for ImageDataGenerator.
+# This must come after the split, because train_df / val_df are created there.
+# Copies are taken so the column assignment does not act on a slice of df.
+train_df = train_df.copy()
+val_df = val_df.copy()
+train_df['target'] = train_df['target'].astype(str)
+val_df['target'] = val_df['target'].astype(str)
+
+print("Train target dtype after conversion:", train_df['target'].dtype)
+print("Validation target dtype after conversion:", val_df['target'].dtype)
+
+"""## 5. Build the model"""
+
+import tensorflow as tf
+from tensorflow.keras import layers, models
+from tensorflow.keras.applications import DenseNet121
+from tensorflow.keras.applications.densenet import preprocess_input
+from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
+from tensorflow.keras.models import Model
+
+# Augmentation is handled by the training ImageDataGenerator further down, so
+# the model itself contains no augmentation layers; images enter the network
+# directly through base_model.input.
+base_model = DenseNet121(
+    weights='imagenet',
+    include_top=False,
+    input_shape=(224, 224, 3)
+)
+
+x = base_model.output
+x = GlobalAveragePooling2D()(x)
+x = Dense(512, activation='relu')(x)  # Added another Dense layer
+x = Dense(256, activation='relu')(x)  # Existing Dense layer
+predictions = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification
+
+model = Model(inputs=base_model.input, outputs=predictions)
+
+"""## 6. Data Generators
+
+I will prepare data generators for training and validation. The training generator will include data augmentation and preprocessing, while the validation generator will only preprocess the images.
+"""
+
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+# Image dimensions
+IMG_WIDTH = 224
+IMG_HEIGHT = 224
+
+# Data generators
+train_datagen = ImageDataGenerator(
+    preprocessing_function=preprocess_input,
+    rotation_range=20,
+    width_shift_range=0.2,
+    height_shift_range=0.2,
+    shear_range=0.2,
+    zoom_range=0.2,
+    horizontal_flip=True,
+    fill_mode='nearest'
+)
+
+val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
+
+# Flow from dataframe (target_size is given as (height, width))
+train_generator = train_datagen.flow_from_dataframe(
+    dataframe=train_df,
+    x_col='path',
+    y_col='target',
+    target_size=(IMG_HEIGHT, IMG_WIDTH),
+    batch_size=32,
+    class_mode='binary',
+    seed=42
+)
+
+# shuffle=False keeps the batches in dataframe order, so that
+# val_generator.classes lines up with the predictions made during evaluation.
+val_generator = val_datagen.flow_from_dataframe(
+    dataframe=val_df,
+    x_col='path',
+    y_col='target',
+    target_size=(IMG_HEIGHT, IMG_WIDTH),
+    batch_size=32,
+    class_mode='binary',
+    shuffle=False
+)
+
+"""## 7. Compile the Model
+
+I will compile the model using the Adam optimizer, binary cross-entropy loss (suitable for binary classification), and track accuracy as a metric.
+"""
+
+from tensorflow.keras.optimizers import Adam
+
+model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
+
+"""## 8. Train the Model
+
+I will now train the model using the prepared data generators. I'll also add callbacks for early stopping to prevent overfitting and to save the best model.
+"""
+
+from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+
+# Callbacks
+early_stopping = EarlyStopping(
+    monitor='val_loss',
+    patience=10,
+    restore_best_weights=True
+)
+
+model_checkpoint = ModelCheckpoint(
+    'best_model.keras',
+    monitor='val_accuracy',
+    save_best_only=True,
+    mode='max'
+)
+
+# Train the model
+history = model.fit(
+    train_generator,
+    epochs=3,  # You can adjust the number of epochs
+    validation_data=val_generator,
+    callbacks=[early_stopping, model_checkpoint],
+    class_weight=class_weights  # Use class weights to handle imbalance
+)
+
+"""## 9. Evaluation
+
+### Load best model
+"""
+
+from tensorflow.keras.models import load_model
+
+best_model = load_model('best_model.keras')
+
+"""### Evaluate on validation set"""
+
+loss, accuracy = best_model.evaluate(val_generator)
+print(f"Validation Loss: {loss:.4f}")
+print(f"Validation Accuracy: {accuracy:.4f}")
+
+"""### Predictions and Classification Report"""
+
+from sklearn.metrics import classification_report, confusion_matrix
+
+val_generator.reset()  # Restart from the first batch before predicting
+y_pred_probs = best_model.predict(val_generator)
+# predict() returns shape (n, 1); flatten it to match the 1-D y_true.
+y_pred = (y_pred_probs > 0.5).astype(int).ravel()
+
+y_true = val_generator.classes
+
+print("Classification Report:")
+print(classification_report(y_true, y_pred))
+
+"""### Confusion Matrix"""
+
+cm = confusion_matrix(y_true, y_pred)
+plt.figure(figsize=(8, 6))
+sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
+plt.title('Confusion Matrix')
+plt.xlabel('Predicted Label')
+plt.ylabel('True Label')
+plt.show()
+
+"""### Training History Plots"""
+
+plt.figure(figsize=(12, 5))
+
+# Plot training & validation accuracy values
+plt.subplot(1, 2, 1)
+plt.plot(history.history['accuracy'])
+plt.plot(history.history['val_accuracy'])
+plt.title('Model Accuracy')
+plt.ylabel('Accuracy')
+plt.xlabel('Epoch')
+plt.legend(['Train', 'Validation'], loc='upper left')
+
+# Plot training & validation loss values
+plt.subplot(1, 2, 2)
+plt.plot(history.history['loss'])
+plt.plot(history.history['val_loss'])
+plt.title('Model Loss')
+plt.ylabel('Loss')
+plt.xlabel('Epoch')
+plt.legend(['Train', 'Validation'], loc='upper left')
+
+plt.tight_layout()
+plt.show()