-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
73 lines (61 loc) · 2.86 KB
/
main.py
File metadata and controls
73 lines (61 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
from llm import embeddings
from langchain_core.documents import Document
from pathlib import Path
# Scale factor handed to the PDF pipeline's ``images_scale`` option.
IMAGE_RESOLUTION_SCALE = 2.0

# Directory created once conversion has finished; the export snippets that are
# currently commented out in this script write their files into it.
output_dir = Path("scratch")

# PDF pipeline settings: apply the image scale and request that both page
# images and picture images are generated during conversion.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
)

# Converter that uses the settings above whenever the input format is PDF.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)

# Alternative inputs, kept for quick switching:
# source = "https://arxiv.org/pdf/2408.09869"
# source = "https://www.w3.org/TR/2013/NOTE-WCAG20-TECHS-20130905/working-examples/PDF20/table.pdf"
source = "https://nlsblog.org/wp-content/uploads/2020/06/image-based-pdf-sample.pdf"

# Run the conversion first; the output directory is only created afterwards.
conv_res = doc_converter.convert(source)
output_dir.mkdir(parents=True, exist_ok=True)

# Base name used as the prefix of exported files.
# NOTE(review): ``source`` is a URL, not a filesystem path. ``Path(...).stem``
# yields the expected name for the active URL, but it drops everything after
# the last dot, so a URL such as ".../pdf/2408.09869" would give "2408".
doc_filename = Path(source).stem
# # Save page images
# for page_no, page in conv_res.document.pages.items():
#     page_no = page.page_no
#     page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
#     with page_image_filename.open("wb") as fp:
#         page.image.pil_image.save(fp, format="PNG")
# # Save images of figures and tables
# table_counter = 0
# picture_counter = 0

# Walk every item of the converted document and print a short preview of each
# text item. The disabled sections inside the loop, once re-enabled together
# with the two counters above, would also save table and picture images as
# PNG files under output_dir.
for element, _depth in conv_res.document.iterate_items():
    # if isinstance(element, TableItem):
    #     table_counter += 1
    #     element_image_filename = (
    #         output_dir / f"{doc_filename}-table-{table_counter}.png"
    #     )
    #     with element_image_filename.open("wb") as fp:
    #         element.get_image(conv_res.document).save(fp, "PNG")
    # if isinstance(element, PictureItem):
    #     picture_counter += 1
    #     element_image_filename = (
    #         output_dir / f"{doc_filename}-picture-{picture_counter}.png"
    #     )
    #     with element_image_filename.open("wb") as fp:
    #         element.get_image(conv_res.document).save(fp, "PNG")

    # Anything that is not a TextItem is skipped.
    if not isinstance(element, TextItem):
        continue

    # Show at most the first 30 characters; the "..." suffix is always added,
    # even when the text is shorter than that.
    print(f"TextItem: {element.text[:30]}...")
# # Save markdown with embedded pictures
# md_filename = output_dir / f"{doc_filename}-with-images.md"
# conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
# # Save markdown with externally referenced pictures
# md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
# conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)
# # Save HTML with externally referenced pictures
# html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
# conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)