"""Gemstone image classification.

Downloads the Kaggle gemstones dataset, extracts hand-crafted features
(color histograms, HOG, Harris corners, Sobel edges, SIFT statistics,
Gabor responses) and trains an XGBoost multiclass classifier.
"""
import os
from pathlib import Path

import cv2
import kagglehub
import numpy as np
import pandas as pd
import xgboost as xgb
from skimage.feature import hog
from skimage.filters import sobel
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Download dataset
path = kagglehub.dataset_download("lsind18/gemstones-images")
print("Path to dataset files:", path)

# Set paths
train_dataset_path = Path(path) / "train"
test_dataset_path = Path(path) / "test"


# ------------------- Preprocessing -------------------
def preprocess_image(image_path):
    """Load an image, denoise it, boost local contrast, resize to 128x128 RGB.

    Contrast is equalized with CLAHE on the L channel of LAB only, so
    the gem's color (a/b channels) is preserved.

    Raises:
        ValueError: if the file cannot be decoded as an image.
    """
    image = cv2.imread(str(image_path))
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail loudly with the offending path rather than a cryptic
        # cvtColor assertion later.
        raise ValueError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.GaussianBlur(image, (5, 5), 0)
    lab = cv2.cvtColor(image, cv2.COLOR_RGB2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=5.0, tileGridSize=(10, 10))
    l = clahe.apply(l)
    lab = cv2.merge((l, a, b))
    image = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
    image = cv2.resize(image, (128, 128))
    return image


# ------------------- Gabor Kernel Bank -------------------
def build_gabor_kernels():
    """Build a bank of 16 Gabor kernels for texture description.

    4 orientations x 2 sigmas x 2 wavelengths, 31x31 each.
    """
    kernels = []
    ksize = 31
    for theta in np.arange(0, np.pi, np.pi / 4):
        for sigma in (1, 3):
            for lamda in (np.pi / 4, np.pi / 2):
                kernel = cv2.getGaborKernel(
                    (ksize, ksize), sigma, theta, lamda, 0.5, 0,
                    ktype=cv2.CV_32F,
                )
                kernels.append(kernel)
    return kernels


gabor_kernels = build_gabor_kernels()

# ------------------- Feature Extraction -------------------
sift = cv2.SIFT_create()


def extract_features(image):
    """Extract a fixed-length feature vector from a 128x128 RGB image.

    Features (in order): per-channel 64-bin color histograms, HOG,
    Harris corner count, mean Sobel edge response, mean/std of SIFT
    descriptors (zero-padded when no keypoints), and mean response of
    each Gabor kernel.
    """
    features = []
    image_resized = cv2.resize(image, (128, 128))

    # Color histograms, L1-normalized (epsilon guards an all-black image)
    for channel in range(3):
        hist = cv2.calcHist([image_resized], [channel], None, [64], [0, 256])
        hist = hist.flatten() / (np.sum(hist) + 1e-5)
        features.extend(hist)

    # HOG over all three channels
    hog_feat = hog(image_resized, orientations=9, pixels_per_cell=(8, 8),
                   cells_per_block=(2, 2), visualize=False, channel_axis=2)
    features.extend(hog_feat)

    # Harris Corners: count of responses above 1% of the maximum
    gray = cv2.cvtColor(image_resized, cv2.COLOR_RGB2GRAY)
    corners = cv2.cornerHarris(np.float32(gray), 2, 3, 0.04)
    corner_count = np.sum(corners > 0.01 * corners.max())
    features.append(corner_count)

    # Sobel Edge Mean
    sobel_img = sobel(gray)
    features.append(np.mean(sobel_img))

    # SIFT (mean + std of descriptors); SIFT descriptors are 128-dim,
    # so zero-pad to keep the feature vector length constant
    kp, des = sift.detectAndCompute(gray, None)
    if des is not None:
        features.extend(np.mean(des, axis=0))
        features.extend(np.std(des, axis=0))
    else:
        features.extend([0] * 128)  # mean
        features.extend([0] * 128)  # std

    # Gabor filter responses (mean).
    # BUGFIX: ddepth must be a float depth. The original cv2.CV_8UC3
    # saturated responses to uint8 and clipped every negative filter
    # response to zero, destroying most of the texture signal.
    for kernel in gabor_kernels:
        fimg = cv2.filter2D(gray, cv2.CV_32F, kernel)
        features.append(np.mean(fimg))

    return features


# ------------------- Load Dataset -------------------
def load_dataset(dataset_path):
    """Walk one sub-folder per gemstone class and featurize every .jpg.

    Returns:
        (all_features, all_labels): parallel lists; labels are the
        folder names. Unreadable/corrupt images are skipped with a
        warning (best-effort load).
    """
    all_features = []
    all_labels = []
    # sorted() makes class discovery order (and thus run output)
    # reproducible across platforms; os.listdir order is arbitrary.
    for gemstone_folder in sorted(os.listdir(dataset_path)):
        gemstone_path = dataset_path / gemstone_folder
        if gemstone_path.is_dir():
            image_files = list(gemstone_path.glob("**/*.jpg"))
            print(f"šŸ“‚ {gemstone_folder}: {len(image_files)} images")
            for image_path in image_files:
                try:
                    image = preprocess_image(image_path)
                    features = extract_features(image)
                    all_features.append(features)
                    all_labels.append(gemstone_folder)
                except Exception as e:
                    # Deliberate best-effort: one bad file must not
                    # abort the whole dataset load.
                    print(f"āš ļø Error processing {image_path}: {e}")
    return all_features, all_labels


print("\nšŸ“„ Loading training data...")
train_features, train_labels = load_dataset(train_dataset_path)

print("\nšŸ“„ Loading test data...")
test_features, test_labels = load_dataset(test_dataset_path)

# ------------------- Encode Labels -------------------
le = LabelEncoder()
y_train = le.fit_transform(train_labels)
# NOTE: transform() raises if the test set contains a class that never
# appeared in training — that is the desired failure mode here.
y_test = le.transform(test_labels)

# ------------------- Create DataFrames -------------------
X_train = pd.DataFrame(train_features)
X_test = pd.DataFrame(test_features)
print("\nšŸ“Š Features DataFrame shape:", X_train.shape)

# ------------------- Optional Train/Val Split -------------------
# stratify keeps the per-class proportions identical in train and val,
# which matters with many small gemstone classes.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# ------------------- Train XGBoost -------------------
# (use_label_encoder was dropped: deprecated since xgboost 1.3 and a
# warning-emitting no-op in current releases.)
clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='mlogloss',
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

print("\nāš™ļø Training XGBoost...")
clf.fit(X_tr, y_tr)  # No early stopping due to version limit

# ------------------- Evaluate -------------------
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nāœ… Accuracy on test set: {acc * 100:.2f}%")
print("\nšŸ“„ Classification Report:\n",
      classification_report(le.inverse_transform(y_test),
                            le.inverse_transform(y_pred)))