# learn_docling/main.py at master · riteshhub/learn_docling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
from llm import embeddings
from langchain_core.documents import Document
from pathlib import Path

# Scale factor handed to the PDF pipeline's `images_scale` option.
IMAGE_RESOLUTION_SCALE = 2.0
# Scratch directory used as the output location.
output_dir = Path("scratch")

# PDF pipeline settings: use the configured image scale and ask the pipeline
# to generate both page images and picture images.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
)

# Converter with the options above registered for PDF input.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)

# Alternative sample inputs; to use one, uncomment it and comment out the
# active `source` line below.
# source = "https://arxiv.org/pdf/2408.09869"
# source = "https://www.w3.org/TR/2013/NOTE-WCAG20-TECHS-20130905/working-examples/PDF20/table.pdf"
# Active input: URL of the PDF handed to the converter.
source = "https://nlsblog.org/wp-content/uploads/2020/06/image-based-pdf-sample.pdf"

# Run the conversion; `conv_res.document` is what the loop below iterates over.
conv_res = doc_converter.convert(source)
# Create the output directory (and missing parents); no error if it exists.
output_dir.mkdir(parents=True, exist_ok=True)
# Base name for output files: last component of `source` minus its final suffix.
# NOTE(review): `source` is a URL, not a filesystem path. For a URL whose last
# component has a dot but no real extension (e.g. the arXiv one above), `.stem`
# drops everything after the last dot — confirm that is acceptable.
doc_filename = Path(source).stem
# # Save page images
# for page_no, page in conv_res.document.pages.items():
#     page_no = page.page_no
#     page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
#     with page_image_filename.open("wb") as fp:
#         page.image.pil_image.save(fp, format="PNG")


# # Save images of figures and tables
# table_counter = 0
# picture_counter = 0
# Walk every item of the converted document; the nesting level yielded
# alongside each item is not used here.
for element, _level in conv_res.document.iterate_items():
#     if isinstance(element, TableItem):
#         table_counter += 1
#         element_image_filename = (
#             output_dir / f"{doc_filename}-table-{table_counter}.png"
#         )
#         with element_image_filename.open("wb") as fp:
#             element.get_image(conv_res.document).save(fp, "PNG")

#     if isinstance(element, PictureItem):
#         picture_counter += 1
#         element_image_filename = (
#             output_dir / f"{doc_filename}-picture-{picture_counter}.png"
#         )
#         with element_image_filename.open("wb") as fp:
#             element.get_image(conv_res.document).save(fp, "PNG")

    # Only text items are reported; anything else is skipped.
    if not isinstance(element, TextItem):
        continue

    # Print at most the first 30 characters of the item's text as a preview.
    print(f"TextItem: {element.text[:30]}...")

# # Save markdown with embedded pictures
# md_filename = output_dir / f"{doc_filename}-with-images.md"
# conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# # Save markdown with externally referenced pictures
# md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
# conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

# # Save HTML with externally referenced pictures
# html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
# conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)