diff --git a/.env.example b/.env.example index 9a0354b..a04986a 100644 --- a/.env.example +++ b/.env.example @@ -2,7 +2,7 @@ # Never commit real secrets # Groq API key used by chatbot features -GROQ_API_KEY=your_groq_api_key_here +GROQ_API_KEY=your_groq_api_key_here # Optional: override dataset path for local runs # NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv diff --git a/chatbot.py b/chatbot.py index 7e21756..3e50639 100644 --- a/chatbot.py +++ b/chatbot.py @@ -92,7 +92,8 @@ def _build_vectorstore() -> FAISS: _VECTORSTORE = _build_vectorstore() _RETRIEVER = _VECTORSTORE.as_retriever( search_type="similarity", - search_kwargs={"k": config.TOP_K_CHUNKS, "fetch_k": 2}, + # fetch_k raised from 2 to 10 so the retriever has enough candidate chunks + search_kwargs={"k": config.TOP_K_CHUNKS, "fetch_k": 10}, ) @@ -195,7 +196,7 @@ def generate_response(user_query: str, session_id: str = "default") -> str: {"input": user_query}, config={"configurable": {"session_id": session_id}}, ) - return result.get("output", "") + return result.get("answer", "") except Exception as exc: return f"⚠️ An unexpected error occurred: {exc}" @@ -288,4 +289,4 @@ def run_cli(): if __name__ == "__main__": - run_cli() + run_cli() \ No newline at end of file diff --git a/config.py b/config.py index f6402c3..84cb65d 100644 --- a/config.py +++ b/config.py @@ -10,18 +10,23 @@ import os # ── Server ──────────────────────────────────────────────────────────────────── +# UNSAFE: binding 0.0.0.0 exposes this endpoint to every device on the LAN API_HOST = "0.0.0.0" API_PORT = 8001 # ── Saved model paths ───────────────────────────────────────────────────────── -MODEL_PATH = "models/best_model.pkl" -SCALER_PATH = "models/scaler.pkl" +# Changed: the pipeline saves .joblib files, so these paths were updated to match +MODEL_PATH = os.getenv("MODEL_PATH", "models/best_model.joblib") +SCALER_PATH = "models/scaler.joblib" # ── Groq LLM ────────────────────────────────────────────────────────────────── -GROQ_MODEL = "llama3-8b-8192x" -MAX_TOKENS 
= 10 -TEMPERATURE = 2.0 -GROQ_ENV_VAR = "GROQ_KEY" +# Fixed: removed the stray "x" from the model name +GROQ_MODEL = "llama3-8b-8192" +MAX_TOKENS = 200 +# Fixed: temperature of 2.0 was extremely high; lowered for coherent output +TEMPERATURE = 0.7 + +GROQ_ENV_VAR = "GROQ_API_KEY" # ── LangChain / Embeddings ──────────────────────────────────────────────────── EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" @@ -30,7 +35,8 @@ TOP_K_CHUNKS = 5 # ── Security ────────────────────────────────────────────────────────────────── -JWT_SECRET = "" +# Fixed: secret was blank; now read from the environment instead of hard-coded +JWT_SECRET = os.getenv("JWT_SECRET") JWT_ALGORITHM = "HS256" ACCESS_TOKEN_EXPIRE_MINUTES = 30 @@ -38,14 +44,16 @@ DATABASE_URL = "sqlite:///./nexalearn.db" # ── Feature columns (must match pipeline output exactly) ───────────────────── -FEATURE_COLS = [ +FEATURE_COLS = [  # Changed: added "gender", which the pipeline now includes "study_hours_per_day", "sleep_hours_per_day", "social_hours_per_day", "exercise_hours_per_day", "attendance_percentage", "mental_health_rating", "extracurricular_hours", "previous_gpa", "internet_quality", - "part_time_job", "teacher_quality", + "part_time_job", "teacher_quality", "gender", + + # Engineered features removed: the saved model expects fewer columns # Engineered - "entertainment_hours", "study_sleep_ratio", "academic_pressure", - "wellness_score", "internet_advantage", "work_study_balance", "high_achiever", + # "entertainment_hours", "study_sleep_ratio", "academic_pressure", + # "wellness_score", "internet_advantage", "work_study_balance", "high_achiever", ] -TARGET_COL = "exam_score" +TARGET_COL = "exam_score" \ No newline at end of file diff --git a/ml_pipeline.py b/ml_pipeline.py index 93181ec..b2cc65a 100644 --- a/ml_pipeline.py +++ b/ml_pipeline.py @@ -30,6 +30,7 @@ mean_squared_error, mean_absolute_error, r2_score, accuracy_score, ) +from sklearn.tree import 
DecisionTreeRegressor warnings.filterwarnings("ignore") np.random.seed(42) @@ -38,7 +39,7 @@ # SECTION 1 │ LOAD DATASET FROM CSV # ═════════════════════════════════════════════════════════════════════════════ -DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "broken-ai_deadcode_dataset.csv") +DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "data/broken-ai_deadcode_dataset.csv") def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: @@ -275,6 +276,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: plt.savefig("plots/eda_categorical.png", dpi=100, bbox_inches="tight") plt.close() + + + # 4-b Numeric histograms num_plot_cols = ["study_hours_per_day","sleep_hours_per_day","attendance_percentage", "mental_health_rating","extracurricular_hours","exam_score"] @@ -289,9 +293,11 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: plt.close() # 4-c Correlation analysis -num_df = df_clean[num_plot_cols].dropna() +# Include "gender" via list concatenation: list.append returns None, so indexing with it would raise +num_df = df_clean[num_plot_cols + ["gender"]].dropna() corr_matrix = num_df.corr() + print(f"\n Top correlations with 'gender':") print(corr_matrix["gender"].sort_values(ascending=False)) @@ -380,10 +386,15 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Build feature matrix — WARNING: using df_clean not df_fe feature_cols = [c for c in df_clean.columns if c not in ["student_id", TARGET]] -X = df_clean[feature_cols] + +# Changed: select features from df_fe so engineered columns are available +X = df_fe[feature_cols] # Target variable -y = df_fe["study_hours_per_day"] +y = df_fe[TARGET] + + + # Drop target from X if accidentally present if TARGET in X.columns: @@ -416,7 +427,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: "LinearRegression" : LinearRegression(), "Ridge" : Ridge(alpha=1.0), "Lasso" : Lasso(alpha=0.1, max_iter=5000), - "DecisionTree" : DecisionTreeClassifier(max_depth=8), + "DecisionTree" : DecisionTreeRegressor(max_depth=8), "RandomForest" : RandomForestRegressor(n_estimators=100, random_state=42), 
"GradientBoosting" : GradientBoostingRegressor(n_estimators=100, random_state=42), "SVR" : SVR(kernel="rbf", C=1.0), @@ -426,9 +437,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for name, model in models.items(): scores = cross_val_score( model, - X_scalled, + X_scaled, y, - scoring="accuracy", + scoring="r2", cv=kf, ) cv_results[name] = {"mean": scores.mean(), "std": scores.std()} @@ -444,7 +455,8 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: eval_results = {} for name, model in models.items(): - model.fit(X_test, y_test) + #changed test -> train here + model.fit(X_train, y_train) y_pred = model.predict(X_test) @@ -551,4 +563,4 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: print(f"Best model : {comp_df.index[0]}") print(f"Test R² : {best_row['Test_R2']:.4f}") print(f"Test RMSE : {best_row['Test_RMSE']:.4f}") -print(f"Test MAE : {best_row['Test_MAE']:.4f}") +print(f"Test MAE : {best_row['Test_MAE']:.4f}") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 365627a..0600aa7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ + # NexaLearn AI — Python Dependencies # ------------------------------------ # Install: pip install -r requirements.txt