import os
from typing import Optional

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Text columns joined (in this order) into the TF-IDF input string.
_TEXT_COLUMNS = (
    'title',
    'main_category_original',
    'sub_category_1_original',
    'sub_category_2_original',
)

# Numeric columns that are standardized and fed into the clustering.
_NUMERIC_FEATURES = [
    'price_actual', 'item_rating', 'total_rating', 'log_total_sold',
    'main_category_freq', 'sub_category_1_freq', 'sub_category_2_freq',
    'discount_rate', 'seller_name_freq',
]

# Relative weights of the numeric and text feature groups before PCA.
_NUMERIC_WEIGHT = 2.0
_TEXT_WEIGHT = 0.3

# Rows whose premium_score is at or above this quantile are labelled
# 'Premium Pick' (i.e. roughly the top 25%); the rest are 'Smart Buy'.
_PREMIUM_QUANTILE = 0.75


def _add_engineered_features(df: pd.DataFrame) -> None:
    """Add discount_rate, log_total_sold and seller_name_freq to *df* in place."""
    discount = (df['price_ori'] - df['price_actual']) / df['price_ori']
    # A zero original price yields +/-inf, which fillna() does not catch and
    # StandardScaler rejects; turn it into NaN so it is zero-filled later.
    df['discount_rate'] = discount.replace([np.inf, -np.inf], np.nan)
    df['log_total_sold'] = np.log1p(df['total_sold'])
    if 'seller_name' in df:
        seller_counts = df['seller_name'].value_counts()
        df['seller_name_freq'] = df['seller_name'].map(seller_counts)
    else:
        df['seller_name_freq'] = 0


def _combine_text_fields(df: pd.DataFrame) -> pd.Series:
    """Return the lower-cased, space-joined text of the title/category columns."""
    # Fill missing values per column *before* joining: NaN + str is NaN, so
    # filling afterwards would blank the whole row (title included) whenever
    # a single category field is missing.
    parts = [df[col].fillna('').astype(str) for col in _TEXT_COLUMNS]
    return parts[0].str.cat(parts[1:], sep=' ').str.lower()


def _assign_premium_labels(df: pd.DataFrame) -> None:
    """Add premium_score and the per-product cluster_label to *df* in place.

    The label is derived from premium_score only; it does not depend on the
    KMeans 'cluster' column.
    """
    # Premium logic: high price and high rating raise the score, high sales
    # lower it (common = not premium).
    # NOTE(review): if these columns hold normalized values the weights mix
    # differently scaled quantities -- confirm against the preprocessing step.
    df['premium_score'] = (
        df['price_actual'] * 0.9
        + df['item_rating'] * 0.2
        - df['total_sold'] * 0.5
    )
    threshold = df['premium_score'].quantile(_PREMIUM_QUANTILE)
    # NaN scores compare False and therefore fall into 'Smart Buy'.
    df['cluster_label'] = np.where(
        df['premium_score'] >= threshold, 'Premium Pick', 'Smart Buy'
    )


def _format_price(value):
    """Format a scalar price as 'RM<amount>' with two decimals, else None."""
    # Check isscalar first: pd.notnull() on a list-like returns an array,
    # whose truth value is ambiguous.
    if np.isscalar(value) and pd.notnull(value):
        return f"RM{float(value):.2f}"
    return None


def _run_pipeline(db) -> None:
    """Run load -> features -> PCA -> KMeans -> save against database *db*."""
    source_collection = db['preprocessData']
    target_collection = db['kmeans']

    print("Loading data from MongoDB...")
    data = list(source_collection.find())
    if not data:
        print("No documents found in 'preprocessData'; nothing to cluster.")
        return
    df = pd.DataFrame(data)
    df.drop(columns=['_id'], inplace=True, errors='ignore')

    print("Applying feature engineering...")
    _add_engineered_features(df)

    print("Combining text fields...")
    df['text_combined'] = _combine_text_fields(df)

    tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2), stop_words='english')
    X_text = tfidf.fit_transform(df['text_combined']).toarray()

    print("Scaling numeric features...")
    X_numeric = df[_NUMERIC_FEATURES].fillna(0)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)

    X_combined = np.hstack((X_scaled * _NUMERIC_WEIGHT, X_text * _TEXT_WEIGHT))

    print("Applying PCA for dimensionality reduction...")
    # A float n_components keeps as many components as needed to explain
    # that fraction of the variance.
    pca = PCA(n_components=0.95)
    X_final = pca.fit_transform(X_combined)

    print("Evaluating Silhouette Scores for k=2 to 6...")
    candidate_ks = range(2, 7)
    scores = []
    for k in candidate_ks:
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(X_final)
        score = silhouette_score(X_final, labels)
        print(f"k={k}, Silhouette Score={score:.4f}")
        scores.append(score)

    plt.plot(candidate_ks, scores, marker='o')
    plt.title('Silhouette Score by Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.show()

    print("Running final KMeans clustering with k=2...")
    kmeans = KMeans(n_clusters=2, random_state=42)
    df['cluster'] = kmeans.fit_predict(X_final)
    final_score = silhouette_score(X_final, df['cluster'])
    print(f"Final Silhouette Score: {final_score:.4f}")

    print("Visualizing clusters with PCA...")
    pca_vis = PCA(n_components=2)
    X_vis = pca_vis.fit_transform(X_final)
    df['pca1'], df['pca2'] = X_vis[:, 0], X_vis[:, 1]

    _assign_premium_labels(df)

    # Attach the untransformed display values from 'products_clean'; columns
    # that already exist in df come back with an '_original' suffix.
    display_df = pd.DataFrame(
        list(db['products_clean'].find({}, {
            '_id': 0, 'id': 1, 'price_actual': 1,
            'item_rating': 1, 'total_sold': 1, 'seller_name': 1
        }))
    )
    df = df.merge(display_df, on='id', how='left', suffixes=('', '_original'))

    df['price_actual_display'] = df['price_actual_original'].apply(_format_price)
    df['item_rating_display'] = df['item_rating_original']
    df['total_sold_display'] = df['total_sold_original']
    # Drop products that have no displayable price.
    df = df[df['price_actual_display'].notnull()]

    print("Saving model artifacts locally...")
    joblib.dump(kmeans, 'kmeans_model.pkl')
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
    # KMeans was fitted in PCA space, so the fitted PCA is required to apply
    # the saved model to new data.
    joblib.dump(pca, 'pca.pkl')

    print("Updating MongoDB with new clustered data...")
    records = df.to_dict(orient='records')
    if not records:
        # insert_many() rejects an empty list; bail out before delete_many()
        # so an empty result never wipes the existing collection.
        print("No rows left to store; leaving 'kmeans' unchanged.")
        return
    target_collection.delete_many({})
    target_collection.insert_many(records)
    print(f"Inserted {len(records)} documents to 'kmeans'.")


def run_kmeans_pipeline(mongo_uri: Optional[str] = None) -> None:
    """Cluster the preprocessed products and store the result in MongoDB.

    Reads 'preprocessData' from the 'fyp' database, builds numeric and TF-IDF
    features, reduces them with PCA, evaluates KMeans for k=2..6, fits the
    final k=2 model, labels each product 'Premium Pick' or 'Smart Buy',
    writes the fitted artifacts to the working directory and replaces the
    contents of the 'kmeans' collection.

    Args:
        mongo_uri: MongoDB connection string. When omitted, the
            ``MONGODB_URI`` environment variable is used.

    Raises:
        RuntimeError: If no connection string is supplied either way.
    """
    # NOTE: never hard-code database credentials in source; any password
    # that was ever committed to this file should be rotated.
    uri = mongo_uri or os.environ.get('MONGODB_URI')
    if not uri:
        raise RuntimeError(
            "MongoDB connection string missing: pass mongo_uri or set the "
            "MONGODB_URI environment variable."
        )

    client = MongoClient(uri)
    try:
        _run_pipeline(client['fyp'])
    finally:
        # Always release the connection pool, even if the pipeline fails.
        client.close()

# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_kmeans_pipeline()
