# Paper2Video/test_pdf.py (Gen-Verse/Paper2Video): extract the images embedded
# on a PDF page and stitch them into a single combined figure.
import fitz  # PyMuPDF
import io
from PIL import Image
import os

def extract_and_combine_images(pdf_path, page_num, output_folder, image_index=0):
    """Extract the images embedded on one PDF page and stitch them into one PNG.

    Every image referenced by the page is decoded with PIL and pasted onto a
    blank RGB canvas at the top-left corner of the rectangle it occupies on
    the page. The result is saved as ``combined_figure.png`` inside
    ``output_folder``. Nothing is written when the page has no usable image.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to read.
        output_folder: Directory that receives ``combined_figure.png``; it is
            created if it does not exist yet.
        image_index: Starting value of the counter shown in the progress
            messages. It is only used for printing.

    Returns:
        None.

    NOTE(review): images are pasted at their native pixel size while the
    placement comes from page coordinates (presumably PDF points), so the
    layout is only approximate when the two scales differ -- confirm whether
    resizing to the page rectangle is wanted.
    """
    placed_images = []  # (PIL image, x0, y0) with integer canvas coordinates

    # The context manager closes the document even when extraction fails.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)

        for img in page.get_images(full=True):
            xref = img[0]
            base_image = doc.extract_image(xref)
            img_ext = base_image["ext"]

            # Entries of get_images() carry no placement information (img[1]
            # is not a rectangle), so the page is asked where this image is
            # actually drawn. An image may be drawn more than once.
            img_rects = page.get_image_rects(xref)
            if not img_rects:
                print(f"Unexpected img_rect format: {img_rects}")
                continue  # Image is referenced but never displayed on the page

            try:
                image = Image.open(io.BytesIO(base_image["image"]))
                image.load()  # Decode now so a broken stream is detected here
            except OSError as exc:
                # Skip formats PIL cannot decode instead of aborting the page.
                print(f"Unexpected img_rect format: cannot decode xref {xref} ({exc})")
                continue

            for img_rect in img_rects:
                print(f"img_rect: {img_rect}")  # Inspect img_rect
                # PIL needs integer pixel offsets; the rectangle holds floats.
                x0, y0 = round(img_rect.x0), round(img_rect.y0)
                placed_images.append((image, x0, y0))

                print(f"✅ extract {image_index}: {img_ext} at position ({x0}, {y0})")
                image_index += 1

    if not placed_images:
        return

    # The canvas must reach the right/bottom edge of the farthest image.
    max_x = max(x0 + image.width for image, x0, _ in placed_images)
    max_y = max(y0 + image.height for image, _, y0 in placed_images)
    large_image = Image.new("RGB", (max_x, max_y))

    # Paste each image onto the canvas at its position on the page.
    for image, x0, y0 in placed_images:
        large_image.paste(image, (x0, y0))

    # Make sure the destination exists before saving the combined image.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    combined_image_path = os.path.join(output_folder, "combined_figure.png")
    large_image.save(combined_image_path)
    print(f"✅ image is saved at：{combined_image_path}")


# Example usage
# pdf_path = "C:\\Users\\87719\\Desktop\\AgenticIR-main\\dataset\\ControlNet.pdf"  # Replace with your PDF file path
# page_num = 0  # Replace with the page number where the images are located
# output_folder = "output_1"  # Replace with the output folder

# extract_and_combine_images(pdf_path, page_num, output_folder)