This commit is contained in:
2026-05-25 20:25:44 +02:00
parent b2c0110d11
commit 9dc29f19e4

View File

@@ -57,7 +57,10 @@ from PIL import Image
# Posted by korakot, modified by community. See post 'Timeline' for change history # Posted by korakot, modified by community. See post 'Timeline' for change history
# Retrieved 2026-05-21, License - CC BY-SA 4.0 # Retrieved 2026-05-21, License - CC BY-SA 4.0
"""### Loading dataset""" """### Loading dataset
We start by loading the CSV dataset into a dataframe:
"""
# Path to your dataset folder # Path to your dataset folder
dataset_path = path dataset_path = path
@@ -71,6 +74,8 @@ df = pd.read_csv(metadata_path)
# Show first rows # Show first rows
print(df.head()) print(df.head())
"""Here we collect the paths of all JPG images in the dataset and add them to the dataframe:"""
# Collect all image paths # Collect all image paths
image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True) image_paths = glob.glob(os.path.join(dataset_path, "**", "*.jpg"), recursive=True)
@@ -91,24 +96,32 @@ print(df[['image_id', 'path']].head())
"""## 3. Dataset analysis """## 3. Dataset analysis
### Missingness ### Missingness
Every row of the dataframe has an image path (the `path` column has no missing values):
""" """
missing = df['path'].isnull().sum() missing = df['path'].isnull().sum()
print(f"Missing images: {missing}") print(f"Missing images: {missing}")
"""### Class distribution""" """### Class distribution
If we analyze the distribution of lesion types, we find a clear class imbalance. _Melanocytic nevi_ is the most abundant class.
"""
plt.figure(figsize=(10,5)) plt.figure(figsize=(10,5))
sns.countplot(data=df, x='dx', order=df['dx'].value_counts().index) sns.countplot(data=df, x='dx', order=df['dx'].value_counts().index)
plt.title("Class Distribution") plt.title("Lesion type distribution")
plt.xlabel("Diagnosis") plt.xlabel("Lesion type")
plt.ylabel("Count") plt.ylabel("Count")
plt.show() plt.show()
"""### Visualize samples""" """### Visualize samples
We visualize some random samples from the dataset, with the lesion type of each one shown above it:
"""
fig, axes = plt.subplots(2, 4, figsize=(10,5)) fig, axes = plt.subplots(2, 4, figsize=(10,5))
@@ -124,7 +137,12 @@ for i, ax in enumerate(axes.flat):
plt.tight_layout() plt.tight_layout()
plt.show() plt.show()
"""### Data distribution""" """### Data distribution
It is useful to study the distribution of the other features in the dataset in order to detect outliers.
We start with age, plotted in 20 bins. First, we find that the most abundant ages are around 50 years old (from 40 to 58). Surprisingly, two bins contain zero samples: those corresponding to ages 25 to 30 and 55 to 60.
"""
plt.figure(figsize=(8,5)) plt.figure(figsize=(8,5))
@@ -133,11 +151,15 @@ sns.histplot(df['age'].dropna(), bins=20)
plt.title("Age Distribution") plt.title("Age Distribution")
plt.show() plt.show()
"""In contrast, the sex distribution is fairly balanced, although there are more males than females."""
sns.countplot(data=df, x='sex') sns.countplot(data=df, x='sex')
plt.title("Sex Distribution") plt.title("Sex Distribution")
plt.show() plt.show()
"""Lesions are not equally likely in all parts of the body: the back and the lower extremities are the most common locations for skin lesions."""
plt.figure(figsize=(12,5)) plt.figure(figsize=(12,5))
sns.countplot( sns.countplot(
@@ -150,7 +172,10 @@ plt.xticks(rotation=45)
plt.title("Lesion Localization") plt.title("Lesion Localization")
plt.show() plt.show()
"""### Image sizes""" """### Image sizes
The size of the images is 600 x 450 pixels.
"""
sizes = [] sizes = []
@@ -162,7 +187,11 @@ print(pd.Series(sizes).value_counts())
"""## 2. Prepare dataset """## 2. Prepare dataset
In this section, we process the previous dataframe to make it suitable for model training.
### Encoding Binary Labels ### Encoding Binary Labels
Our goal is to classify lesions as benign or malignant, so we encode the lesion type into a new numerical feature describing these two states: 0 means benign and 1 means malignant.
""" """
# Mapping from dx to benign/malignant # Mapping from dx to benign/malignant
@@ -184,17 +213,32 @@ df['target'] = df['dx'].map(benign_malignant_dict)
# Preview # Preview
print(df[['dx', 'target']].head()) print(df[['dx', 'target']].head())
"""### Dataset split
We split the dataset into training (70%), validation (15%), and test (15%) sets.
"""
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split( train_val_df, test_df = train_test_split(
df, df,
test_size=0.2, test_size=0.15,
stratify=df['target'], random_state=42,
random_state=42 stratify=df['target']
)
train_df, val_df = train_test_split(
train_val_df,
test_size=0.1765, # ≈ 15% of total dataset
random_state=42,
stratify=train_val_df['target']
) )
print("Train size:", len(train_df)) print("Train size:", len(train_df))
print("Validation size:", len(val_df)) print("Validation size:", len(val_df))
print("Test size:", len(test_df))
"""We convert the encoded column to string format for ImageDataGenerator."""
# Convert 'target' column to string type for ImageDataGenerator # Convert 'target' column to string type for ImageDataGenerator
train_df['target'] = train_df['target'].astype(str) train_df['target'] = train_df['target'].astype(str)
@@ -203,48 +247,62 @@ val_df['target'] = val_df['target'].astype(str)
print("Train target dtype after conversion:", train_df['target'].dtype) print("Train target dtype after conversion:", train_df['target'].dtype)
print("Validation target dtype after conversion:", val_df['target'].dtype) print("Validation target dtype after conversion:", val_df['target'].dtype)
"""### Balancing classes by oversampling the minority class""" """### Balancing classes by oversampling the minority class
# Identify majority and minority classes in the training set We decide to handle class imbalance through three approaches:
class_counts = train_df['target'].value_counts() * Minority class oversampling
majority_class = class_counts.idxmax() * Class weight
minority_class = class_counts.idxmin() * Data augmentation
"""
# Get the DataFrames for majority and minority classes ## Identify majority and minority classes in the training set
df_majority = train_df[train_df['target'] == majority_class] #class_counts = train_df['target'].value_counts()
df_minority = train_df[train_df['target'] == minority_class] #majority_class = class_counts.idxmax()
#minority_class = class_counts.idxmin()
# Oversample the minority class ## Get the DataFrames for majority and minority classes
df_minority_oversampled = df_minority.sample( #df_majority = train_df[train_df['target'] == majority_class]
class_counts[majority_class], replace=True, random_state=42 #df_minority = train_df[train_df['target'] == minority_class]
)
# Combine majority class with oversampled minority class ## Oversample the minority class
train_df_balanced = pd.concat([df_majority, df_minority_oversampled]) #df_minority_oversampled = df_minority.sample(
# class_counts[majority_class], replace=True, random_state=42
#)
# Shuffle the balanced DataFrame ## Combine majority class with oversampled minority class
train_df_balanced = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True) #train_df_balanced = pd.concat([df_majority, df_minority_oversampled])
print("Original train_df class distribution:") ## Shuffle the balanced DataFrame
print(train_df['target'].value_counts()) #train_df_balanced = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print("\nBalanced train_df_balanced class distribution:")
print(train_df_balanced['target'].value_counts()) #print("Original train_df class distribution:")
#print(train_df['target'].value_counts())
#print("\nBalanced train_df_balanced class distribution:")
#print(train_df_balanced['target'].value_counts())
plt.figure(figsize=(6,3)) plt.figure(figsize=(6,3))
sns.countplot(data=df, x='target') sns.countplot(data=train_df, x='target')
plt.title("Benign vs Malignant Distribution") plt.title("Benign vs Malignant Distribution")
plt.xlabel("Lesion Type") plt.xlabel("Lesion Type")
plt.ylabel("Count") plt.ylabel("Count")
plt.show() plt.show()
print(df['target'].value_counts()) print(train_df['target'].value_counts())
"""### Train / Validation split #plt.figure(figsize=(6,3))
### Class weight (class imbalance) #sns.countplot(data=train_df_balanced, x='target')
"""
#plt.title("Benign vs Malignant Distribution")
#plt.xlabel("Lesion Type")
#plt.ylabel("Count")
#plt.show()
#print(train_df_balanced['target'].value_counts())
"""### Class weight (class imbalance)"""
from sklearn.utils.class_weight import compute_class_weight from sklearn.utils.class_weight import compute_class_weight
@@ -292,7 +350,7 @@ inputs = tf.keras.Input(shape=(224,224,3))
x = data_augmentation(inputs) x = data_augmentation(inputs)
x = base_model.output x = base_model(x)
x = GlobalAveragePooling2D()(x) x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x) # Added another Dense layer x = Dense(512, activation='relu')(x) # Added another Dense layer
x = Dense(256, activation='relu')(x) # Existing Dense layer x = Dense(256, activation='relu')(x) # Existing Dense layer
@@ -347,50 +405,10 @@ val_generator = val_datagen.flow_from_dataframe(
seed=42 seed=42
) )
"""Now, the `train_generator` will use the `train_df_balanced` DataFrame, which has an equal number of samples for both classes. This will help the model learn more effectively from the minority class during training.""" """Now, the `train_generator` will use the `train_df_balanced` DataFrame, which has an equal number of samples for both classes. This will help the model learn more effectively from the minority class during training.
from tensorflow.keras.preprocessing.image import ImageDataGenerator ## 6. Train the Model
"""
# Image dimensions
IMG_WIDTH = 224
IMG_HEIGHT = 224
# Data generators
train_datagen = ImageDataGenerator(
preprocessing_function=preprocess_input,
rotation_range=20,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,
fill_mode='nearest'
)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
# Flow from balanced dataframe for training
train_generator = train_datagen.flow_from_dataframe(
dataframe=train_df_balanced, # Use the balanced DataFrame
x_col='path',
y_col='target',
target_size=(IMG_WIDTH, IMG_HEIGHT),
batch_size=32,
class_mode='binary',
seed=42
)
val_generator = val_datagen.flow_from_dataframe(
dataframe=val_df,
x_col='path',
y_col='target',
target_size=(IMG_WIDTH, IMG_HEIGHT),
batch_size=32,
class_mode='binary',
seed=42
)
"""## 6. Train the Model"""
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
@@ -404,34 +422,51 @@ model_checkpoint = ModelCheckpoint(
# Train the model # Train the model
history = model.fit( history = model.fit(
train_generator, train_generator,
epochs=50, # You can adjust the number of epochs epochs=30, # You can adjust the number of epochs
validation_data=val_generator, validation_data=val_generator,
callbacks=[model_checkpoint], callbacks=[model_checkpoint],
class_weight=class_weights # Use class weights to handle imbalance class_weight=class_weights # Use class weights to handle imbalance
) )
"""## 7. Evaluation """## 7. Evaluation"""
### Load best model IMG_WIDTH = 224
""" IMG_HEIGHT = 224
test_datagen = ImageDataGenerator(
preprocessing_function=preprocess_input
)
test_generator = test_datagen.flow_from_dataframe(
dataframe=test_df,
x_col='path',
y_col='target',
target_size=(IMG_WIDTH, IMG_HEIGHT),
batch_size=32,
class_mode='binary',
shuffle=False # CRITICAL
)
"""### Load best model"""
from tensorflow.keras.models import load_model from tensorflow.keras.models import load_model
best_model = load_model('best_model.keras') best_model = load_model('best_model.keras')
"""### Evaluate on validation set""" """### Evaluate on test set"""
loss, accuracy = best_model.evaluate(val_generator) results = best_model.evaluate(test_df)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}") for name, value in zip(best_model.metrics_names, results):
print(f"{name}: {value:.4f}")
"""### Predictions and Classification Report""" """### Predictions and Classification Report"""
from sklearn.metrics import classification_report, confusion_matrix from sklearn.metrics import classification_report, confusion_matrix
val_generator.reset() # Reset generator to ensure correct order test_generator.reset()
y_pred_probs = best_model.predict(val_generator) y_prob = best_model.predict(test_generator)
y_pred = (y_pred_probs > 0.5).astype(int) y_pred = (y_prob > 0.5).astype(int).ravel()
y_true = val_generator.classes y_true = val_generator.classes
@@ -440,14 +475,55 @@ print(classification_report(y_true, y_pred))
"""### Confusion Matrix""" """### Confusion Matrix"""
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
cm = confusion_matrix(y_true, y_pred) cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant']) plt.figure(figsize=(6,5))
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label') sns.heatmap(
plt.ylabel('True Label') cm,
annot=True,
fmt='d',
cmap='Blues',
xticklabels=["Benign", "Malignant"],
yticklabels=["Benign", "Malignant"]
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Test Confusion Matrix")
plt.show() plt.show()
"""### Classification report"""
from sklearn.metrics import classification_report
print(classification_report(
y_true,
y_pred,
target_names=["Benign", "Malignant"]
))
"""### ROC-AUC"""
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_true, y_prob)
print(f"ROC-AUC: {auc:.4f}")
"""### Recall"""
from sklearn.metrics import recall_score
sensitivity = recall_score(y_true, y_pred)
print(f"Sensitivity: {sensitivity:.4f}")
"""### Training History Plots""" """### Training History Plots"""
plt.figure(figsize=(12, 5)) plt.figure(figsize=(12, 5))