
Commit 10d35bf

babblebey and Copilot authored
feat(dev): incremental update mechanism for vector store (#283)
## Description

This pull request introduces an incremental update system for the ✨jAI vector store, allowing dictionary changes to be efficiently propagated to Qdrant without a full reseed. The update process is automated via a new GitHub Actions workflow and can be triggered manually or on production deployments tied to specific PR labels. The documentation and scripts have been refactored for clarity and maintainability, and the seeding process now attaches metadata for future incremental updates.

**Incremental Vector Store Update System**

* Added a new GitHub Actions workflow (`.github/workflows/update-vector-store.yml`) to automate incremental updates to the Qdrant vector store. It triggers on production deployments or manual dispatch, gates on PR labels, detects dictionary file changes, and runs the update script only when necessary.
* Introduced `dev/update-vector-store.js`, a script that upserts or deletes only the changed dictionary words in Qdrant. It uses CLI arguments for slugs, fetches live API data, deletes old chunks by `metadata.slug`, splits content, and updates the vector store with robust error handling.
* Updated `package.json` to add `update:jai` and `update:jai:ci` npm scripts for local and CI/CD usage of the incremental update script.

**Seeding and Documentation Improvements**

* Refactored the seeding script (`dev/seed-vector-store.js`) to create LangChain `Document` objects directly from API data, attach `metadata.slug` for all words, and remove file system dependencies, ensuring compatibility with incremental updates.
* Overhauled `dev/README.md` to document the new incremental update workflow, CLI usage, error handling, and example outputs. The seed and update processes are clearly differentiated, and the vector store requirements for incremental updates are explained.

## Related Issue

Fixes #196

## Screenshots/Screencasts

NA

## Notes to Reviewer

Add the new env entries to GitHub secrets and variables under Actions:
- `OPENAI_API_KEY` - secret
- `OPENAI_EMBEDDINGS_MODEL` - variable

---------

Co-authored-by: Copilot <[email protected]>
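The `--upsert`/`--delete` CLI surface described above could be parsed along these lines. This is a hypothetical sketch, not the actual `dev/update-vector-store.js` code, and the helper name `parseSlugArgs` is invented for illustration:

```javascript
// Hypothetical sketch of the --upsert/--delete CLI parsing described in this PR;
// the real dev/update-vector-store.js may differ in structure and error messages.
function parseSlugArgs(argv) {
  const result = { upsert: [], delete: [] };
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (flag === "--upsert" || flag === "--delete") {
      const value = argv[i + 1];
      if (!value || value.startsWith("--")) {
        throw new Error(`Missing value for ${flag}`);
      }
      // Comma-separated slugs, e.g. "api,closure"
      result[flag.slice(2)] = value
        .split(",")
        .map((s) => s.trim())
        .filter(Boolean);
      i++; // skip the consumed value
    } else {
      throw new Error(`Unknown flag: ${flag}`);
    }
  }
  return result;
}

// Example: npm run update:jai -- --upsert api,closure --delete old-term
const args = parseSlugArgs(["--upsert", "api,closure", "--delete", "old-term"]);
console.log(args); // { upsert: [ 'api', 'closure' ], delete: [ 'old-term' ] }
```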
1 parent 4548f87 commit 10d35bf

File tree

5 files changed

+533
-35
lines changed

.github/workflows/update-vector-store.yml

Lines changed: 170 additions & 0 deletions

```yaml
name: Update Vector Store (Qdrant)

on:
  deployment_status:

  # Allow manual triggering with custom slugs
  workflow_dispatch:
    inputs:
      upsert_slugs:
        description: "Comma-separated slugs to upsert (e.g. api,closure)"
        required: false
      delete_slugs:
        description: "Comma-separated slugs to delete (e.g. old-term)"
        required: false

jobs:
  update-vector-store:
    runs-on: ubuntu-latest

    # Only run on successful production deployments (or manual trigger)
    if: >
      github.event_name == 'workflow_dispatch' ||
      (
        github.event.deployment_status.state == 'success' &&
        github.event.deployment.environment == 'Production'
      )

    steps:
      # ── Gate: Check that the merged PR has a dictionary label ──────────
      - name: Check PR labels
        if: github.event_name != 'workflow_dispatch'
        id: pr-check
        uses: actions/github-script@v7
        with:
          script: |
            const sha = context.payload.deployment.sha;

            // Find PRs associated with this deployment commit
            const { data: prs } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
              owner: context.repo.owner,
              repo: context.repo.repo,
              commit_sha: sha,
            });

            // Find the merged PR targeting main
            const mergedPR = prs.find(pr => pr.merged_at && pr.base.ref === 'main');

            if (!mergedPR) {
              core.info('No merged PR found for this deployment. Skipping.');
              core.setOutput('should_continue', 'false');
              return;
            }

            const labels = mergedPR.labels.map(l => l.name);
            core.info(`PR #${mergedPR.number}: ${mergedPR.title}`);
            core.info(`Labels: ${labels.join(', ')}`);

            const requiredLabels = ['📖edit-word', '📖new-word'];
            const hasRequiredLabel = labels.some(l => requiredLabels.includes(l));

            if (!hasRequiredLabel) {
              core.info(`PR does not have required labels (${requiredLabels.join(', ')}). Skipping.`);
              core.setOutput('should_continue', 'false');
              return;
            }

            core.info('✅ PR has required label. Proceeding with update.');
            core.setOutput('should_continue', 'true');

      - name: Skip — PR lacks required labels
        if: github.event_name != 'workflow_dispatch' && steps.pr-check.outputs.should_continue != 'true'
        run: |
          echo "⏭️ Skipping: deployment is not from a 📖new-word or 📖edit-word PR."

      # ── Checkout & detect changed dictionary files ─────────────────────
      - name: Checkout repository
        if: github.event_name == 'workflow_dispatch' || steps.pr-check.outputs.should_continue == 'true'
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.deployment.sha || github.sha }}
          fetch-depth: 2

      - name: Detect changed dictionary files
        if: github.event_name == 'workflow_dispatch' || steps.pr-check.outputs.should_continue == 'true'
        id: detect
        run: |
          # For manual triggers, use the provided inputs directly
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            UPSERT="${{ github.event.inputs.upsert_slugs }}"
            DELETE="${{ github.event.inputs.delete_slugs }}"

            if [ -z "$UPSERT" ] && [ -z "$DELETE" ]; then
              echo "has_changes=false" >> "$GITHUB_OUTPUT"
              echo "No slugs provided for manual trigger."
            else
              echo "upsert=$UPSERT" >> "$GITHUB_OUTPUT"
              echo "delete=$DELETE" >> "$GITHUB_OUTPUT"
              echo "has_changes=true" >> "$GITHUB_OUTPUT"
            fi
            exit 0
          fi

          # For deployment triggers, diff against the parent commit
          echo "Detecting dictionary file changes..."
          UPSERT_SLUGS=""
          DELETE_SLUGS=""

          while IFS=$'\t' read -r status file; do
            if [[ "$file" == src/content/dictionary/*.mdx ]]; then
              slug=$(basename "$file" .mdx)

              if [[ "$status" == "D" ]]; then
                DELETE_SLUGS="${DELETE_SLUGS:+$DELETE_SLUGS,}$slug"
              else
                UPSERT_SLUGS="${UPSERT_SLUGS:+$UPSERT_SLUGS,}$slug"
              fi
            fi
          done < <(git diff --name-status HEAD~1 -- src/content/dictionary/)

          if [ -z "$UPSERT_SLUGS" ] && [ -z "$DELETE_SLUGS" ]; then
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
            echo "No dictionary file changes detected. Skipping update."
          else
            echo "upsert=$UPSERT_SLUGS" >> "$GITHUB_OUTPUT"
            echo "delete=$DELETE_SLUGS" >> "$GITHUB_OUTPUT"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            echo "Upsert slugs: $UPSERT_SLUGS"
            echo "Delete slugs: $DELETE_SLUGS"
          fi

      - name: Skip — no dictionary changes
        if: steps.detect.outputs.has_changes != 'true' && (github.event_name == 'workflow_dispatch' || steps.pr-check.outputs.should_continue == 'true')
        run: echo "⏭️ No dictionary changes to process. Skipping."

      # ── Run the incremental update ─────────────────────────────────────
      - name: Setup Node.js
        if: steps.detect.outputs.has_changes == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "npm"

      - name: Install dependencies
        if: steps.detect.outputs.has_changes == 'true'
        run: npm ci

      - name: Update vector store
        if: steps.detect.outputs.has_changes == 'true'
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_EMBEDDINGS_MODEL: ${{ vars.OPENAI_EMBEDDINGS_MODEL }}
          QDRANT_URL: ${{ secrets.QDRANT_URL }}
          QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
        run: |
          ARGS=""

          if [ -n "${{ steps.detect.outputs.upsert }}" ]; then
            ARGS="$ARGS --upsert ${{ steps.detect.outputs.upsert }}"
          fi

          if [ -n "${{ steps.detect.outputs.delete }}" ]; then
            ARGS="$ARGS --delete ${{ steps.detect.outputs.delete }}"
          fi

          echo "Running: npm run update:jai:ci -- $ARGS"
          npm run update:jai:ci -- $ARGS

      - name: Update successful
        if: steps.detect.outputs.has_changes == 'true'
        run: echo "✅ Vector store update completed successfully"
```
dev/README.md

Lines changed: 128 additions & 12 deletions
```diff
@@ -40,7 +40,6 @@ Before running this script, ensure you have:
 - All dependencies installed (`npm ci`)
 - `OPENAI_API_KEY`, `QDRANT_URL` and `QDRANT_API_KEY` environment variables properly configured in your `.env` file
 - Network access to fetch from jargons.dev API
-- Sufficient disk space for temporary dictionary file
 
 ### Usage
 
```
```diff
@@ -53,19 +52,17 @@ npm run seed:jai
 The script performs these steps to prepare ✨jAI's knowledge base:
 
 1. **Data Fetching**: Downloads the complete dictionary from `https://jargons.dev/api/v1/browse`
-2. **File Processing**: Saves data locally and loads it using LangChain's JSONLoader
-3. **Document Splitting**: Breaks content into optimally-sized chunks (1000 chars with 200 overlap)
+2. **Document Creation**: Creates LangChain `Document` objects directly from the API response, attaching `slug` metadata to each word for future incremental updates
+3. **Document Splitting**: Breaks content into optimally-sized chunks (1000 chars with 200 overlap), preserving the slug metadata on every chunk
 4. **Vector Store Population**: Adds processed documents to ✨jAI's vector store in batches of 100
-5. **Cleanup**: Removes temporary files and provides completion summary
 
 ### Technical Implementation
 
 The script leverages several key technologies:
 
-- **LangChain JSONLoader**: Extracts title and content fields from dictionary entries
+- **LangChain Document**: Creates documents directly from API data with `metadata.slug` for traceability
 - **RecursiveCharacterTextSplitter**: Intelligently splits text while preserving context
 - **Batch Processing**: Prevents memory issues and provides progress feedback
-- **File System Operations**: Handles temporary file creation and cleanup
 
 ### Configuration Options
 
```
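The 1000-char / 200-overlap splitting mentioned in the seed steps works on a sliding window: each chunk is at most 1000 characters and overlaps the previous one by roughly 200. A naive fixed-size analog of that arithmetic (LangChain's `RecursiveCharacterTextSplitter` is smarter, preferring to break on paragraph and sentence separators, but the window math is the same idea):

```javascript
// Naive fixed-window analog of the 1000-char / 200-overlap splitting used by
// the seed script. The real RecursiveCharacterTextSplitter breaks on natural
// separators before falling back to hard cuts; this sketch only shows the
// window arithmetic.
function naiveSplit(text, chunkSize = 1000, chunkOverlap = 200) {
  const chunks = [];
  const step = chunkSize - chunkOverlap; // advance 800 chars per chunk
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, start + chunkSize));
    if (start + chunkSize >= text.length) break; // last window reached the end
  }
  return chunks;
}

const chunks = naiveSplit("x".repeat(2500));
console.log(chunks.length); // 3
console.log(chunks.map((c) => c.length).join(",")); // 1000,1000,900
```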
````diff
@@ -85,25 +82,144 @@ Key parameters that can be adjusted:
 
 The script includes robust error handling for:
 - Network connectivity issues during API calls
-- File system errors during temporary file operations
 - Vector store connection problems
 - Memory management during large batch processing
 
 ### Example Output
 
 ```
-Saved the dictionary file to /path/to/dev/dictionary.json
-Loaded 500 documents
-Split 1250 documents
+Fetched 500 words from the API
+Created 500 documents
+Split into 1250 chunks
 Added batch 1 of 13 (100 documents) to the vector store
 Added batch 2 of 13 (100 documents) to the vector store
 ...
 Added 1250 splits to the vector store
-Cleaned up the dictionary file at /path/to/dev/dictionary.json
 ```
 
 Once completed, ✨jAI will have access to the processed dictionary content and can provide intelligent responses about software engineering terms.
 
+> **Note:** After running a full seed, all vector points will include `metadata.slug`, which is required for incremental updates via the [Update Vector Store Script](#update-vector-store-script) to work correctly.
+
+## Update Vector Store Script
+
+This script performs **incremental updates** to ✨jAI's vector store when dictionary words are added, modified, or removed. Instead of re-seeding the entire collection, it targets only the changed words — making it fast and efficient for CI/CD use after new words are merged.
+
+### When to Use
+
+This script is primarily run automatically via the **Update Vector Store** GitHub Actions workflow when a new word PR is merged and the Vercel production deployment succeeds. You can also run it manually when you need to:
+- Add or update specific words in the vector store
+- Remove deleted words from the vector store
+- Fix vector store entries for particular terms
+
+### Prerequisites
+
+Before running this script, ensure you have:
+- All dependencies installed (`npm ci`)
+- `OPENAI_API_KEY`, `OPENAI_EMBEDDINGS_MODEL`, `QDRANT_URL` and `QDRANT_API_KEY` environment variables properly configured in your `.env` file
+- Network access to fetch from the jargons.dev production API
+- The vector store has been initially seeded with `metadata.slug` on all points (via `npm run seed:jai`)
+
+### Usage
+
+**Local Development:**
+```bash
+npm run update:jai -- --upsert slug1,slug2 --delete slug3,slug4
+```
+
+**CI/CD (without .env file):**
+```bash
+npm run update:jai:ci -- --upsert slug1,slug2 --delete slug3
+```
+
+### Flags
+
+- `--upsert <slugs>` — Comma-separated slugs of words to add or update. For each slug, the script deletes any existing chunks in Qdrant (by `metadata.slug` filter), fetches the latest content from the production API, splits it into chunks, and adds them to the vector store.
+- `--delete <slugs>` — Comma-separated slugs of words to remove. Deletes all chunks matching the slug from Qdrant.
+
+Both flags are optional, but at least one must be provided for the script to do anything.
+
+### How It Works
+
+The script performs these steps for each word:
+
+**For upserts (add/update):**
+1. **Delete Old Chunks**: Removes existing vector points matching `metadata.slug` via a Qdrant filter
+2. **Fetch Latest Content**: Downloads the word from `https://jargons.dev/api/v1/browse/{slug}`
+3. **Create Document**: Builds a LangChain `Document` with `metadata.slug` for traceability
+4. **Split into Chunks**: Breaks content into optimally-sized chunks (1000 chars with 200 overlap)
+5. **Add to Vector Store**: Upserts the new chunks into Qdrant
+
+**For deletes:**
+1. **Delete Chunks**: Removes all vector points matching `metadata.slug` via a Qdrant filter
+
+### Technical Implementation
+
+The script leverages several key technologies:
+
+- **LangChain Document**: Creates documents with `metadata.slug` for targeted updates
+- **Qdrant Filter-based Deletion**: Uses `vectorStore.delete({ filter })` with a `metadata.slug` match condition to precisely target existing chunks for a word
+- **RecursiveCharacterTextSplitter**: Same chunking config as the seed script (1000/200) for consistency
+- **Production API**: Fetches from the deployed site to ensure the vector store matches the live content
+
+### Configuration Options
+
+Required environment variables:
+
+- **QDRANT_URL**: Your Qdrant cluster endpoint (e.g., `https://your-cluster.gcp.cloud.qdrant.io`)
+- **QDRANT_API_KEY**: Your Qdrant cluster API key for authentication
+- **OPENAI_API_KEY**: Your OpenAI API Key for generating embeddings
+- **OPENAI_EMBEDDINGS_MODEL**: The embeddings model to use (e.g., `text-embedding-3-small`)
+
+### Automated via GitHub Actions
+
+The **Update Vector Store** workflow (`.github/workflows/update-vector-store.yml`) runs this script automatically:
+
+- **Trigger**: Fires on `deployment_status` events — specifically when Vercel reports a successful **Production** deployment
+- **PR Label Gate**: Uses the GitHub API to find the merged PR associated with the deployment commit and checks for the `📖new-word` or `📖edit-word` labels. Deployments from PRs without these labels are skipped early (before any Node.js setup or dependency installation)
+- **Change Detection**: Diffs `HEAD~1` to identify added, modified, or deleted `.mdx` files in `src/content/dictionary/`
+- **Skip Logic**: Exits early if no dictionary files were changed in the commit
+- **Manual Trigger**: Can also be run manually from the GitHub Actions tab with custom `upsert_slugs` and `delete_slugs` inputs (bypasses the label check)
+- **Required Secrets**: `OPENAI_API_KEY`, `QDRANT_URL`, `QDRANT_API_KEY`
+- **Required Variables**: `OPENAI_EMBEDDINGS_MODEL`
+
+### Error Handling
+
+The script includes robust error handling for:
+- Unknown flags or flags missing required values (prints an error with usage instructions and exits with code 1)
+- No slugs provided (prints usage and exits gracefully with code 0)
+- Words not found on the production API (404 — warns and continues with remaining slugs)
+- Network connectivity issues
+- Vector store connection and deletion failures
+- Per-word error isolation (one failing slug doesn't block the others)
+- Non-zero exit code if any operation fails
+
+### Example Output
+
+```
+🚀 Starting incremental vector store update...
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+📝 Words to upsert: api, closure
+🗑️ Words to delete: old-term
+
+🔄 Processing upsert for "api"...
+Deleting old chunks for "api"...
+Split into 3 chunk(s).
+✅ Upserted "api" (3 chunks)
+
+🔄 Processing upsert for "closure"...
+Deleting old chunks for "closure"...
+Split into 2 chunk(s).
+✅ Upserted "closure" (2 chunks)
+
+🗑️ Deleting "old-term" from vector store...
+✅ Deleted "old-term"
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+✨ Done! Upserted: 2, Deleted: 1, Failed: 0
+🎉 Vector store update completed successfully!
+```
+
 ## Vector Store Cluster Ping Script
 
 This script performs a lightweight health check on the Vector Store (Qdrant) cluster to keep it active and prevent automatic deletion due to inactivity. It's designed to be run both locally for testing and automatically via GitHub Actions.
````
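The "batch 1 of 13" lines in the seed example output come from uploading chunks to the vector store 100 at a time. The batching arithmetic can be sketched as follows (hypothetical helper name; the real script's loop may be structured differently):

```javascript
// Split an array of chunks into upload batches of `size`. The seed script adds
// documents to the vector store 100 at a time, so 1250 chunks yield 13 batches
// (twelve full batches of 100 plus a final batch of 50).
function toBatches(items, size = 100) {
  const batches = [];
  for (let i = 0; i < items.length; i += size) {
    batches.push(items.slice(i, i + size));
  }
  return batches;
}

const batches = toBatches(new Array(1250).fill(null));
console.log(batches.length); // 13
console.log(batches[12].length); // 50
```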
```diff
@@ -164,7 +280,7 @@ Required environment variables:
 ### Automated Scheduling
 
 The script is automatically run via GitHub Actions:
-- **Schedule**: Every Sunday at 2 AM UTC
+- **Schedule**: Every Sunday and Wednesday at midnight UTC
 - **Manual Trigger**: Can be run manually from GitHub Actions tab
 - **Purpose**: Prevents cluster deletion due to inactivity
 
```
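The filter-based deletion that the update script relies on (per the README's Technical Implementation notes) uses Qdrant's standard match-condition shape. A sketch of building that filter; it assumes chunk metadata is stored under a `metadata.slug` payload key, as the seed script does, and the helper name `slugFilter` is invented for illustration:

```javascript
// Build the Qdrant filter used to target every chunk belonging to one word.
// Assumes chunks were stored with the slug under the `metadata.slug` payload
// key; `slugFilter` is an illustrative helper name, not from the actual script.
function slugFilter(slug) {
  return {
    must: [
      {
        key: "metadata.slug",
        match: { value: slug },
      },
    ],
  };
}

// Used roughly like: await vectorStore.delete({ filter: slugFilter("api") });
console.log(JSON.stringify(slugFilter("api")));
// {"must":[{"key":"metadata.slug","match":{"value":"api"}}]}
```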