-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathultra_simple.py
More file actions
133 lines (104 loc) · 3.55 KB
/
Copy pathultra_simple.py
File metadata and controls
133 lines (104 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!python
"""
Ultra Simple PDF Q&A - Just ask and get answers
"""
import pdfplumber
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import ollama
import os
def load_pdf():
    """Locate a PDF in the current directory and return its extracted text.

    Candidates are files in the working directory whose name ends in
    ".pdf" (compared case-insensitively). A file whose name contains one of
    the two Arabic keywords checked below is preferred; otherwise the first
    candidate in sorted name order is used.

    Returns:
        The text of every page that yielded text, each page preceded by a
        "--- Page N ---" marker and concatenated in page order; an empty
        string when no page produced any text; or None when the directory
        contains no PDF files.
    """
    # os.listdir() returns entries in arbitrary order, so sort to make the
    # fallback choice reproducible. Lower-casing lets "REPORT.PDF" match too.
    pdf_files = sorted(
        name for name in os.listdir(".") if name.lower().endswith(".pdf")
    )
    if not pdf_files:
        print("❌ No PDF files found!")
        return None

    # Prefer Arabic PDF: first name containing either keyword, else the
    # first candidate overall.
    pdf_file = next(
        (name for name in pdf_files if "النظام" in name or "الأساسي" in name),
        pdf_files[0],
    )
    print(f"📄 Loading: {pdf_file}")

    # Collect per-page sections and join once instead of repeated `+=`.
    sections = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            # Pages with no extractable text are skipped entirely.
            if page_text:
                sections.append(f"\n--- Page {page_num} ---\n{page_text}\n")
    return "".join(sections)
def _chunk_words(words, chunk_size, overlap):
    """Join a word list into space-separated windows that overlap by `overlap` words."""
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any later window would only
        # repeat words already contained in this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks


def setup_rag(text, chunk_size=500, overlap=100):
    """Chunk the text, embed the chunks, and build a FAISS similarity index.

    Args:
        text: Document text; it is split on whitespace into words.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks; must be
            non-negative and smaller than `chunk_size`.

    Returns:
        A tuple `(model, index, chunks)`: the SentenceTransformer used for
        encoding, a `faiss.IndexFlatIP` holding the L2-normalized chunk
        embeddings (so inner product equals cosine similarity), and the list
        of chunk strings in index order.

    Raises:
        ValueError: If the chunking parameters are inconsistent or `text`
            contains no words.
    """
    print("🔧 Setting up RAG system...")

    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )

    # Chunk text
    chunks = _chunk_words(text.split(), chunk_size, overlap)
    if not chunks:
        # Fail before the (slow) model load; there is nothing to index.
        raise ValueError("text contains no words to index")

    # Create embeddings
    print("📊 Creating embeddings...")
    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    embeddings = model.encode(chunks, show_progress_bar=True)
    # Convert to contiguous float32 *before* normalizing: normalize_L2 works
    # in place, so it must operate on the exact array that gets indexed.
    embeddings = np.ascontiguousarray(embeddings, dtype='float32')

    # Build vector store
    print("🏗️ Building vector store...")
    index = faiss.IndexFlatIP(embeddings.shape[1])
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print(f"✅ Ready! Created {len(chunks)} chunks.")
    return model, index, chunks
def ask_question(question, model, index, chunks, top_k=5):
    """Answer a question from the most similar document chunks.

    Args:
        question: The user's question text.
        model: Encoder exposing `encode(list_of_str)`; the question is
            embedded with it.
        index: Vector index exposing `search(vectors, k)`.
        chunks: Chunk strings, positioned to match the ids returned by
            `index.search`.
        top_k: Number of nearest chunks to request from the index.

    Returns:
        The model's reply text, or a string of the form "Error: <message>"
        if the Ollama call fails.
    """
    # Search similar chunks. Cast to contiguous float32 before normalizing so
    # the in-place normalization applies to the array that is searched.
    query_embedding = np.ascontiguousarray(model.encode([question]), dtype='float32')
    faiss.normalize_L2(query_embedding)
    _scores, indices = index.search(query_embedding, top_k)

    # Get relevant chunks. FAISS pads the result with -1 when the index holds
    # fewer than top_k vectors; without the lower bound, -1 would silently
    # select chunks[-1] and duplicate the last chunk in the context.
    relevant_chunks = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]
    context = "\n\n".join(relevant_chunks)

    # Get answer from Ollama
    prompt = (
        "\n"
        "Based ONLY on the following context, answer the question in Arabic.\n"
        "If the answer is not in the context, say \"Information not Available\".\n"
        f"Context: {context}\n"
        f"Question: {question}\n"
        "Answer:"
    )

    try:
        response = ollama.chat(
            model="llama3.2",
            messages=[{'role': 'user', 'content': prompt}]
        )
        return response['message']['content']
    except Exception as e:
        # Deliberate best-effort: the interactive caller prints whatever is
        # returned, so report the failure as text instead of raising.
        return f"Error: {str(e)}"
def main():
    """Run the interactive console session: load a PDF, index it, answer questions.

    Returns early (after `load_pdf` has reported the problem) when no text
    could be loaded. Otherwise reads questions from standard input until the
    user types 'quit', 'exit' or 'q', presses Ctrl-C, or input reaches
    end-of-file.
    """
    print("🚀 PDF Q&A System")
    print("=" * 50)

    # Load PDF
    text = load_pdf()
    if not text:
        return

    # Setup RAG
    model, index, chunks = setup_rag(text)

    print("\n💬 Ask questions (type 'quit' to exit):")
    print("=" * 50)

    while True:
        try:
            question = input("\n❓ ").strip()

            if question.lower() in ('quit', 'exit', 'q'):
                print("👋 Goodbye!")
                break

            # Blank lines are ignored and simply re-prompt.
            if question:
                print("🤖 Thinking...")
                answer = ask_question(question, model, index, chunks)
                print(f"\n📝 {answer}")

        except (KeyboardInterrupt, EOFError):
            # input() raises EOFError on Ctrl-D or exhausted piped stdin;
            # treat it like Ctrl-C and exit cleanly instead of crashing.
            print("\n👋 Goodbye!")
            break
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()