# -*- coding: utf-8 -*-
"""7_nust_digitalimageprocessing_hp2cells.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/19p0R4DLa-9HQqdgIhgfkp4yHjMGKjuxQ
"""

# Colab-only step: open the upload dialog and keep its return value in
# `uploaded`.
from google.colab import files
uploaded = files.upload()  # Upload your ZIP file manually

import zipfile

# NOTE(review): the archive name is hard-coded and is not derived from
# `uploaded`, so the uploaded file has to be named exactly 'Test.zip'.
zip_path = 'Test.zip'  # After upload

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # extractall() without a target unpacks into the current working
    # directory (presumably /content on Colab -- confirm).
    zip_ref.extractall()  # ← extracts to /content directly

# Root folder of the extracted data set used by the cells below.
base_path = '/content/Test'

import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from skimage.measure import label, regionprops
from skimage.filters import threshold_otsu
from skimage.morphology import remove_small_objects

# --- Feature Extraction ---
def extract_basic_features_color(cell_image, mask):
    """Compute simple green-channel features for one cell crop.

    Args:
        cell_image: 3-D image array; channel index 1 is used as the green
            channel.
        mask: 2-D array expected to have the same height and width as
            ``cell_image``. Pixels ``> 0`` count as foreground.

    Returns:
        ``[area, mean_intensity, std_intensity]`` where ``area`` is the
        number of foreground pixels and the two intensities are the mean and
        standard deviation of the green values under the mask.
        ``[0, 0, 0]`` is returned (after printing a warning) when the shapes
        differ, and ``[0, 0.0, 0.0]`` when the mask has no foreground.
    """
    green_channel = cell_image[:, :, 1]

    if green_channel.shape != mask.shape:
        print(f"❗ Shape mismatch: image {green_channel.shape}, mask {mask.shape}")
        return [0, 0, 0]  # Skip bad sample with dummy features

    foreground = mask > 0
    # Count pixels instead of summing raw mask values, so a 0/255 mask and a
    # 0/1 mask report the same area.
    area = int(np.count_nonzero(foreground))
    if area == 0:
        # Mean/std of an empty selection would be NaN; report zeros instead.
        return [0, 0.0, 0.0]

    masked_pixels = green_channel[foreground]
    return [area, float(np.mean(masked_pixels)), float(np.std(masked_pixels))]

import os

# Sanity check: show the first five entries of the image and mask folders.
print("Sample files in Cells folder:")
print(os.listdir('/content/Test/Cells')[:5])  # adjust path if needed

print("\nSample files in Masks folder:")
print(os.listdir('/content/Test/Masks')[:5])

import pandas as pd

# Load the per-cell annotation table: ';'-separated, decoded as 'utf-8-sig',
# with cell_ID forced to str (presumably to keep zero padding such as '001'
# -- confirm against the CSV).
df = pd.read_csv('/content/Test/Segmentation/01.csv', sep=';', encoding='utf-8-sig', dtype={'cell_ID': str})
print(df.columns)
print(df.head())

def extract_features_and_labels(cells_folder, masks_folder, csv_path):
    """Build feature rows and labels for every cell listed in a CSV.

    For each CSV row the cell image ``<cell_ID>.png`` is read from
    ``cells_folder`` and its mask from ``masks_folder``. Rows whose files are
    missing, unreadable, or differ in size are skipped with a message.

    Args:
        cells_folder: Directory containing the per-cell images.
        masks_folder: Directory containing the per-cell masks.
        csv_path: ';'-separated CSV with at least ``cell_ID`` and
            ``pattern`` columns.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays; ``features`` holds one
        ``extract_basic_features_color`` result per kept cell and ``labels``
        the matching ``pattern`` values.
    """
    # Local imports keep the function independent of module-level imports.
    import pandas as pd
    import os
    import cv2
    import numpy as np

    print("🔍 Reading CSV...")
    df = pd.read_csv(csv_path, sep=';', encoding='utf-8-sig', dtype={'cell_ID': str})
    print("✅ CSV loaded with", len(df), "rows")

    features = []
    labels = []

    skipped = 0
    processed = 0

    for _, row in df.iterrows():
        cell_id = row['cell_ID']  # Already a zero-padded string

        cell_img_path = os.path.join(cells_folder, f'{cell_id}.png')
        mask_img_path = os.path.join(masks_folder, f'{cell_id}.png')

        if not os.path.exists(cell_img_path) or not os.path.exists(mask_img_path):
            print(f"⚠️ Skipping missing: {cell_id}")
            skipped += 1
            continue

        cell_img = cv2.imread(cell_img_path)
        mask = cv2.imread(mask_img_path, 0)

        # cv2.imread returns None (no exception) for a file it cannot decode;
        # guard before touching .shape.
        if cell_img is None or mask is None:
            print(f"⚠️ Skipping unreadable: {cell_id}")
            skipped += 1
            continue

        if cell_img.shape[:2] != mask.shape:
            print(f"⚠️ Skipping due to size mismatch: {cell_id}, image={cell_img.shape}, mask={mask.shape}")
            skipped += 1
            continue

        feats = extract_basic_features_color(cell_img, mask)
        features.append(feats)
        labels.append(row['pattern'])

        processed += 1
        # Only echo the first few samples to keep the output readable.
        if processed <= 5:
            print(f"✅ Processed: {cell_id}, Features: {feats}")

    print(f"\n✅ Total loaded samples: {len(features)}")
    print(f"❌ Skipped incomplete samples: {skipped}")

    return np.array(features), np.array(labels)

# Define the paths this cell relies on so it does not fail with NameError
# when the file is executed top to bottom.
# NOTE(review): 'Masks' mirrors the folder listed in the sanity check of the
# CSV preview cell -- confirm it is the intended mask source here.
cells_folder = os.path.join(base_path, 'Cells')
masks_folder = os.path.join(base_path, 'Masks')
csv_path = os.path.join(base_path, 'Segmentation', '01.csv')

# Diagnostic pass: for every CSV row report whether the cell image and its
# mask exist on disk.
for _, row in df.iterrows():
    cell_id = str(row['cell_ID'])  # Already correct format now

    cell_img_path = os.path.join(cells_folder, f'{cell_id}.png')
    mask_img_path = os.path.join(masks_folder, f'{cell_id}.png')

    print("Looking for:", cell_img_path)
    print("Looking for:", mask_img_path)

    if not os.path.exists(cell_img_path):
        print("❌ Missing image:", cell_img_path)
    else:
        print(f"✅ Loaded: {cell_id}")

    if not os.path.exists(mask_img_path):
        print("❌ Missing mask:", mask_img_path)
    else:
        print(f"✅ Loaded: {cell_id}")

features, labels = extract_features_and_labels(cells_folder, masks_folder, csv_path)

import os
import cv2
import pandas as pd

def generate_cropped_masks(full_mask_path, csv_path, output_folder):
    """Cut one mask file per cell out of a full-image segmentation mask.

    The full mask is read as grayscale and, for every CSV row, the region
    ``[minY:maxY, minX:maxX]`` is written to ``<output_folder>/<cell_ID>.png``.
    The output folder is created if it does not exist.

    Raises:
        ValueError: If the full mask cannot be loaded.
    """
    os.makedirs(output_folder, exist_ok=True)

    whole_mask = cv2.imread(full_mask_path, cv2.IMREAD_GRAYSCALE)
    if whole_mask is None:
        raise ValueError(f"Could not load mask image: {full_mask_path}")

    table = pd.read_csv(csv_path, sep=';', encoding='utf-8-sig', dtype={'cell_ID': str})

    bbox_columns = ('minX', 'minY', 'maxX', 'maxY')
    for _, record in table.iterrows():
        name = record['cell_ID']
        left, top, right, bottom = (int(record[column]) for column in bbox_columns)

        # Rows index Y, columns index X.
        target = os.path.join(output_folder, f"{name}.png")
        cv2.imwrite(target, whole_mask[top:bottom, left:right])

    print(f"✅ {len(table)} masks saved to: {output_folder}")

# Set your paths
full_mask_path = '/content/Test/Segmentation/01_mask.bmp'
csv_path = '/content/Test/Segmentation/01.csv'
output_mask_folder = '/content/Test/Generated_Masks'
# Make sure `cells_folder` is defined before it is used below; otherwise a
# top-to-bottom run of the file stops with NameError.
cells_folder = '/content/Test/Cells'

# Generate cropped masks
generate_cropped_masks(full_mask_path, csv_path, output_mask_folder)

# Extract features using the freshly generated per-cell masks.
masks_folder = '/content/Test/Generated_Masks'
features, labels = extract_features_and_labels(cells_folder, masks_folder, csv_path)

import matplotlib.pyplot as plt

# Visual spot check: show one cell image next to its generated mask.
test_id = '001'
img = cv2.imread(os.path.join(cells_folder, f'{test_id}.png'))
mask = cv2.imread(os.path.join(masks_folder, f'{test_id}.png'), 0)

plt.subplot(1,2,1)
# OpenCV loads BGR; convert so matplotlib shows the colours correctly.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title("Cell Image")

plt.subplot(1,2,2)
plt.imshow(mask, cmap='gray')
plt.title("Generated Mask")

plt.show()

# --- Classifier Training ---
def train_classifier(features, labels):
    """Fit a random forest on an 80/20 split and print its hold-out scores.

    Args:
        features: Feature rows, one per sample.
        labels: Class label per sample.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    train_x, holdout_x, train_y, holdout_y = split

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_x, train_y)

    predicted = forest.predict(holdout_x)
    print("Classification Report:\n", classification_report(holdout_y, predicted))
    print("Confusion Matrix:\n", confusion_matrix(holdout_y, predicted))
    return forest

# --- Segmentation ---
def segment_cells(image):
    """Label connected bright regions of a BGR image.

    The image is converted to grayscale, thresholded with Otsu's method,
    cleaned with ``remove_small_objects(min_size=50)`` and then labelled.

    Returns:
        The labelled image produced by ``skimage.measure.label``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # The comparison already yields the boolean array the clean-up step needs.
    foreground = grayscale > threshold_otsu(grayscale)
    cleaned = remove_small_objects(foreground, min_size=50)
    return label(cleaned)

# --- IoU Evaluation ---
def compute_iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        boxA: Box as ``[x1, y1, x2, y2]``.
        boxB: Box as ``[x1, y1, x2, y2]``.

    Returns:
        IoU as a float in ``[0, 1]``. ``0.0`` is returned when the union has
        no area (e.g. both boxes are degenerate) instead of dividing by zero.
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp at zero so disjoint boxes yield an empty intersection.
    interArea = max(0, xB - xA) * max(0, yB - yA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    unionArea = boxAArea + boxBArea - interArea
    if unionArea <= 0:
        return 0.0
    return interArea / float(unionArea)

def evaluate_segmentation(labeled_image, csv_df):
    """Match predicted regions to ground-truth boxes by IoU.

    Each region of ``labeled_image`` is compared, in CSV row order, against
    the ground-truth boxes; the first box with IoU above 0.5 is recorded as
    its match.

    Args:
        labeled_image: Labelled image accepted by ``regionprops``.
        csv_df: DataFrame with ``minX``, ``minY``, ``maxX``, ``maxY`` columns.

    Returns:
        List of ``(pred_box, gt_box, iou)`` tuples, boxes as
        ``[x1, y1, x2, y2]``.
    """
    # Pull the ground-truth boxes out once so the DataFrame is not
    # re-iterated for every predicted region.
    gt_boxes = csv_df[['minX', 'minY', 'maxX', 'maxY']].values.tolist()

    results = []
    for region in regionprops(labeled_image):
        minr, minc, maxr, maxc = region.bbox
        pred_box = [minc, minr, maxc, maxr]  # Format: x1, y1, x2, y2
        for gt_box in gt_boxes:
            iou = compute_iou(pred_box, gt_box)
            if iou > 0.5:
                results.append((pred_box, gt_box, iou))
                break  # keep only the first match per predicted region
    print(f"Matched cells: {len(results)} / {len(csv_df)}")
    return results

# --- Main Execution ---

# Define base path to the Test folder
base_path = '/content/Test'  # <--- update this path

cells_folder = os.path.join(base_path, 'Cells')
masks_folder = os.path.join(base_path, 'Generated_Masks')
segmentation_folder = os.path.join(base_path, 'Segmentation')

csv_path = os.path.join(segmentation_folder, '01.csv')
image_path = os.path.join(segmentation_folder, '01.bmp')

# Step 1: Extract features & train classifier
features, labels = extract_features_and_labels(cells_folder, masks_folder, csv_path)
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)

if len(features) == 0:
    raise ValueError("No features were extracted. Check input paths and mask generation.")

clf = train_classifier(features, labels)

# Step 2: Segment full image
full_img = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail here with a clear message rather than inside segment_cells.
if full_img is None:
    raise ValueError(f"Could not load image: {image_path}")
labeled_img = segment_cells(full_img)

# Step 3: Load GT CSV and evaluate segmentation
# Read with the same encoding as every other read of this CSV.
gt_df = pd.read_csv(csv_path, sep=';', encoding='utf-8-sig')
results = evaluate_segmentation(labeled_img, gt_df)

# Step 4: Visualize
plt.imshow(labeled_img, cmap='nipy_spectral')
plt.title("Segmented Cell Regions")
plt.axis('off')
plt.show()

print("Number of samples:", len(features))
print("Class distribution:", pd.Series(labels).value_counts())

print("Unique labels:", np.unique(labels))

# Start over: repeat the whole pipeline, this time with 5 patients.
import os
import numpy as np
import pandas as pd
import cv2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from skimage.filters import threshold_otsu
from skimage.morphology import remove_small_objects
from skimage.measure import label, regionprops
import matplotlib.pyplot as plt

# --- Feature Extraction ---
def extract_basic_features_color(cell_image, mask):
    """Return ``[area, mean, std]`` of the green channel under ``mask``.

    Args:
        cell_image: 3-D image array; channel index 1 is read as green.
        mask: 2-D array expected to match the image height and width.
            Pixels ``> 0`` are foreground.

    Returns:
        A three-element list: foreground pixel count, mean green intensity
        and its standard deviation over the foreground. On a shape mismatch
        a warning is printed and ``[0, 0, 0]`` is returned; a mask without
        foreground gives ``[0, 0.0, 0.0]``.
    """
    green_channel = cell_image[:, :, 1]
    if green_channel.shape != mask.shape:
        print(f"❗ Shape mismatch: image {green_channel.shape}, mask {mask.shape}")
        return [0, 0, 0]

    foreground = mask > 0
    # Area is a pixel count, independent of whether the mask stores 1 or 255.
    area = int(np.count_nonzero(foreground))
    if area == 0:
        # Avoid NaN statistics from an empty selection.
        return [0, 0.0, 0.0]

    masked_pixels = green_channel[foreground]
    mean_intensity = float(np.mean(masked_pixels))
    std_intensity = float(np.std(masked_pixels))
    return [area, mean_intensity, std_intensity]

def extract_features_and_labels(cells_folder, masks_folder, csv_path):
    """Collect features and labels for the cells listed in ``csv_path``.

    Each row's ``<cell_ID>.png`` is read from ``cells_folder`` and
    ``masks_folder``. Rows whose files are missing, unreadable or of
    different size are skipped; the number skipped is printed if non-zero.

    Args:
        cells_folder: Directory of per-cell images.
        masks_folder: Directory of per-cell masks.
        csv_path: ';'-separated CSV with ``cell_ID`` and ``pattern`` columns.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays.
    """
    df = pd.read_csv(csv_path, sep=';', encoding='utf-8-sig', dtype={'cell_ID': str})
    features = []
    labels = []
    skipped = 0

    for _, row in df.iterrows():
        cell_id = row['cell_ID']
        cell_img_path = os.path.join(cells_folder, f'{cell_id}.png')
        mask_img_path = os.path.join(masks_folder, f'{cell_id}.png')

        if not os.path.exists(cell_img_path) or not os.path.exists(mask_img_path):
            skipped += 1
            continue

        cell_img = cv2.imread(cell_img_path)
        mask = cv2.imread(mask_img_path, 0)

        # imread yields None for undecodable files; sizes must also agree.
        if cell_img is None or mask is None or cell_img.shape[:2] != mask.shape:
            skipped += 1
            continue

        feats = extract_basic_features_color(cell_img, mask)
        features.append(feats)
        labels.append(row['pattern'])

    # Surface dropped samples instead of discarding the count silently.
    if skipped:
        print(f"⚠️ Skipped {skipped} of {len(df)} cells (missing, unreadable or mismatched files)")

    return np.array(features), np.array(labels)

# --- Mask Cropping ---
def generate_cropped_masks(full_mask_path, csv_path, output_folder):
    """Write one cropped mask per CSV row into ``output_folder``.

    The folder is created first. If the full mask cannot be loaded an error
    line is printed and the function returns without writing anything.
    Each crop is ``full_mask[minY:maxY, minX:maxX]`` saved as
    ``<cell_ID>.png``.
    """
    os.makedirs(output_folder, exist_ok=True)

    source_mask = cv2.imread(full_mask_path, cv2.IMREAD_GRAYSCALE)
    if source_mask is None:
        print(f"❌ Error loading mask: {full_mask_path}")
        return

    cells = pd.read_csv(csv_path, sep=';', encoding='utf-8-sig', dtype={'cell_ID': str})
    for _, cell in cells.iterrows():
        name = cell['cell_ID']
        x0, y0, x1, y1 = [int(cell[key]) for key in ('minX', 'minY', 'maxX', 'maxY')]
        destination = os.path.join(output_folder, f"{name}.png")
        cv2.imwrite(destination, source_mask[y0:y1, x0:x1])

# --- Classifier ---
def train_classifier(features, labels):
    """Train a random forest on a stratified 80/20 split and report metrics.

    Prints the classification report and confusion matrix for the hold-out
    part, then passes the hold-out results to ``compute_metrics`` together
    with the unique values of ``labels``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    train_x, holdout_x, train_y, holdout_y = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_x, train_y)
    predicted = forest.predict(holdout_x)

    print("\n📊 Classification Report:\n", classification_report(holdout_y, predicted))
    print("🧩 Confusion Matrix:\n", confusion_matrix(holdout_y, predicted))

    # Class list comes from the full label set, not only the hold-out part.
    compute_metrics(holdout_y, predicted, np.unique(labels))
    return forest

from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

def compute_metrics(y_true, y_pred, class_names=None):
    """Print overall accuracy plus per-class sensitivity and specificity.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Labels to report, in confusion-matrix order. When
            omitted, the sorted labels present in ``y_true`` or ``y_pred``
            are used.
    """
    if class_names is None:
        # Mirror the label set confusion_matrix picks for labels=None so the
        # matrix rows and the loop below stay aligned.
        class_names = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))

    cm = confusion_matrix(y_true, y_pred, labels=class_names)
    accuracy = accuracy_score(y_true, y_pred)
    print(f"✅ Overall Accuracy: {accuracy:.4f}")

    print("\n📈 Class-wise Sensitivity (Recall) and Specificity:")
    for idx, cls in enumerate(class_names):
        # One-vs-rest counts: row idx is the true class, column idx the
        # predicted class.
        TP = cm[idx, idx]
        FN = np.sum(cm[idx, :]) - TP
        FP = np.sum(cm[:, idx]) - TP
        TN = np.sum(cm) - (TP + FN + FP)
        # Report 0 rather than dividing by zero when a class is absent.
        sensitivity = TP / (TP + FN) if (TP + FN) else 0
        specificity = TN / (TN + FP) if (TN + FP) else 0
        print(f"🔹 Class '{cls}': Sensitivity = {sensitivity:.4f}, Specificity = {specificity:.4f}")

# --- Main Loop Over Patients ---
# Runs mask generation and feature extraction per patient folder, then trains
# one classifier on the pooled samples.
base_path = '/content/Test'

# Filter for folders named '01', '02', etc.
patients = sorted([f for f in os.listdir(base_path) if f.isdigit()])

# One (features, labels) pair is appended per patient that yields samples.
all_features = []
all_labels = []

for patient in patients:
    print(f"\n🔍 Processing patient {patient}...")
    patient_path = os.path.join(base_path, patient)
    cells_folder = os.path.join(patient_path, 'Cells')
    seg_folder = os.path.join(patient_path, 'Segmentation')
    # NOTE(review): the same '01' file names are used for every patient, and
    # their existence is not checked before use.
    csv_path = os.path.join(seg_folder, '01.csv')          # Update if filename varies
    full_mask_path = os.path.join(seg_folder, '01_mask.bmp')
    masks_folder = os.path.join(patient_path, 'Generated_Masks')

    # Step 1: Generate Cropped Masks
    # Only when the folder is missing or empty; existing masks are reused.
    if not os.path.exists(masks_folder) or len(os.listdir(masks_folder)) == 0:
        generate_cropped_masks(full_mask_path, csv_path, masks_folder)

    # Step 2: Extract Features & Labels
    features, labels = extract_features_and_labels(cells_folder, masks_folder, csv_path)

    print(f"✅ Extracted: {len(features)} samples with {len(np.unique(labels))} unique label(s)")
    if len(features) > 0:
        all_features.append(features)
        all_labels.append(labels)

# Combine data from all patients
if all_features:
    # Stack the per-patient arrays into one feature matrix / label vector.
    features = np.vstack(all_features)
    labels = np.concatenate(all_labels)

    print(f"\n🧪 Total combined samples: {features.shape[0]}")
    print("🧬 Class distribution:", pd.Series(labels).value_counts().to_dict())

    # Train classifier
    clf = train_classifier(features, labels)
else:
    print("❌ No data found across patients.")

# Re-run after renaming each patient's segmentation files to the patient ID
# (01, 02, ...).
# --- Main Loop Over Patients ---
base_path = '/content/Test'

# Folder names like "01", "02", "03", ...
patients = sorted([f for f in os.listdir(base_path) if f.isdigit()])

# One (features, labels) pair is appended per patient that yields samples.
all_features = []
all_labels = []

for patient in patients:
    print(f"\n🔍 Processing patient {patient}...")
    patient_path = os.path.join(base_path, patient)
    cells_folder = os.path.join(patient_path, 'Cells')
    seg_folder = os.path.join(patient_path, 'Segmentation')

    # Use patient ID in filenames
    csv_path = os.path.join(seg_folder, f'{patient}.csv')
    full_mask_path = os.path.join(seg_folder, f'{patient}_mask.bmp')
    masks_folder = os.path.join(patient_path, 'Generated_Masks')

    # --- Check if required files exist
    # A patient with any missing input is skipped rather than aborting the run.
    if not os.path.exists(csv_path):
        print(f"❌ Missing CSV: {csv_path}, skipping.")
        continue
    if not os.path.exists(full_mask_path):
        print(f"❌ Missing mask: {full_mask_path}, skipping.")
        continue
    if not os.path.exists(cells_folder):
        print(f"❌ Missing Cells folder: {cells_folder}, skipping.")
        continue

    # --- Step 1: Generate Cropped Masks
    # Only when the folder is missing or empty; existing masks are reused.
    if not os.path.exists(masks_folder) or len(os.listdir(masks_folder)) == 0:
        generate_cropped_masks(full_mask_path, csv_path, masks_folder)

    # --- Step 2: Extract Features & Labels
    features, labels = extract_features_and_labels(cells_folder, masks_folder, csv_path)
    print(f"✅ Extracted: {len(features)} samples with {len(np.unique(labels))} unique label(s)")

    if len(features) > 0:
        all_features.append(features)
        all_labels.append(labels)

# --- Combine Data and Train Classifier
if all_features:
    # Stack the per-patient arrays into one feature matrix / label vector.
    features = np.vstack(all_features)
    labels = np.concatenate(all_labels)

    print(f"\n🧪 Total combined samples: {features.shape[0]}")
    print("🧬 Class distribution:", pd.Series(labels).value_counts().to_dict())

    clf = train_classifier(features, labels)
else:
    print("❌ No valid data found to train classifier.")

# Same per-patient loop, run for all 28 patients.
# --- Main Loop Over Patients ---
base_path = '/content/Test'

# Folder names like "01", "02", "03", ...
patients = sorted([f for f in os.listdir(base_path) if f.isdigit()])

# Accumulators for the pooled training set.
all_features = []
all_labels = []

for patient in patients:
    print(f"\n🔍 Processing patient {patient}...")
    patient_path = os.path.join(base_path, patient)
    cells_folder = os.path.join(patient_path, 'Cells')
    seg_folder = os.path.join(patient_path, 'Segmentation')

    # Use patient ID in filenames
    csv_path = os.path.join(seg_folder, f'{patient}.csv')
    full_mask_path = os.path.join(seg_folder, f'{patient}_mask.bmp')
    masks_folder = os.path.join(patient_path, 'Generated_Masks')

    # --- Check if required files exist
    # Skip (do not abort) when the CSV, full mask or Cells folder is absent.
    if not os.path.exists(csv_path):
        print(f"❌ Missing CSV: {csv_path}, skipping.")
        continue
    if not os.path.exists(full_mask_path):
        print(f"❌ Missing mask: {full_mask_path}, skipping.")
        continue
    if not os.path.exists(cells_folder):
        print(f"❌ Missing Cells folder: {cells_folder}, skipping.")
        continue

    # --- Step 1: Generate Cropped Masks
    # Regenerate only if no masks are present yet for this patient.
    if not os.path.exists(masks_folder) or len(os.listdir(masks_folder)) == 0:
        generate_cropped_masks(full_mask_path, csv_path, masks_folder)

    # --- Step 2: Extract Features & Labels
    features, labels = extract_features_and_labels(cells_folder, masks_folder, csv_path)
    print(f"✅ Extracted: {len(features)} samples with {len(np.unique(labels))} unique label(s)")

    if len(features) > 0:
        all_features.append(features)
        all_labels.append(labels)

# --- Combine Data and Train Classifier
if all_features:
    # Pool every patient's samples before the train/test split.
    features = np.vstack(all_features)
    labels = np.concatenate(all_labels)

    print(f"\n🧪 Total combined samples: {features.shape[0]}")
    print("🧬 Class distribution:", pd.Series(labels).value_counts().to_dict())

    clf = train_classifier(features, labels)
else:
    print("❌ No valid data found to train classifier.")

def plot_feature_importance(model, feature_names):
    """Draw a horizontal bar chart of a model's feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names aligned with the model's feature order.
    """
    # seaborn is used below but is not imported at module level; import it
    # here so the call does not fail with NameError.
    import seaborn as sns

    importances = model.feature_importances_
    # Most important feature first.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(6, 4))
    sns.barplot(x=importances[indices], y=[feature_names[i] for i in indices], palette='magma')
    plt.title('🌟 Feature Importance')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()

# Names follow the order of the list returned by extract_basic_features_color.
# NOTE(review): `clf` is only assigned when the training branch above ran.
feature_names = ['Area', 'Mean Green Intensity', 'Std Dev Green Intensity']
plot_feature_importance(clf, feature_names)

def plot_class_distribution(labels):
    """Draw a bar chart with the number of samples per class label.

    Args:
        labels: Sequence of class labels, one per sample.
    """
    # seaborn is used below but is not imported at module level; import it
    # here so the call does not fail with NameError.
    import seaborn as sns

    label_counts = pd.Series(labels).value_counts()
    plt.figure(figsize=(12, 6))
    sns.barplot(x=label_counts.index, y=label_counts.values, palette='viridis')
    plt.title("📊 Class Distribution")
    plt.xlabel("Class Label")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

# Plot the distribution of the module-level `labels` array.
plot_class_distribution(labels)