-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
212 lines (188 loc) · 6.82 KB
/
main.py
File metadata and controls
212 lines (188 loc) · 6.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""
This script performs the following tasks:
1. Selects a folder containing e-invoice archives.
2. Resets the target folders by deleting their contents and recreating them.
3. Extracts PDF files from ZIP archives in the selected folder.
4. Processes the extracted PDFs to split them based on invoice details.
5. Renames and organizes the split PDF files.
"""
from datetime import datetime
import os
import re
import shutil
from PyPDF2 import PdfReader, PdfWriter
import zipfile
from pathlib import Path
import tkinter as tk
from tkinter import filedialog
# Constants
# Text whose presence in a page marks it as an e-invoice details page.
einv_det: str = "e-Invoice Details"
# Regular expressions applied to the extracted page text; each one exposes
# the value of interest as capture group 1.
doc_no_ptn: str = r"Document No. :\s*([\w\s\W]+?)\s*IGST"
doc_dt_ptn: str = r"Document Date : (\d{2}-\d{2}-\d{4})"
ack_dt_ptn: str = r"Ack Date : (\d{2}-\d{2}-\d{4})"
# Stem of the PDF member pulled out of every ZIP archive.
einv_name = "einv1"
# Folder names, resolved below against the current working directory.
op_folder = "output"
ar_folder = "archives"
ei_folder = "einvoices"
cwd_folder = Path.cwd()
# Working folders as Path objects (the "/" operator joins path segments).
ei_path = cwd_folder / ei_folder
op_path = cwd_folder / op_folder
ar_path = cwd_folder / ar_folder
def select_folder():
    """
    Opens a dialog to select a folder containing e-invoice archives.

    Returns:
        Path or None: The path to the selected folder, or None when the
        dialog is cancelled or closed without a selection.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the root window
    try:
        folder_path = filedialog.askdirectory(
            title="Select the folder containing e-invoice Archives"
        )
    finally:
        # The hidden root window is only needed while the dialog is open.
        root.destroy()
    # A cancelled dialog yields an empty value. Path("") would turn into
    # Path("."), which is always truthy, so a caller testing the result
    # could never detect the cancel and would process the current directory.
    if not folder_path:
        return None
    return Path(folder_path)
def reset_folder(folder: Path):
    """
    Brings a folder back to an empty state.

    An existing folder is removed together with everything inside it, and
    the folder is then created again (including missing parents). Failures
    in either step are reported on stdout and do not raise.

    Args:
        folder (Path): The path to the folder to reset.
    """
    if folder.exists():
        try:
            shutil.rmtree(folder)  # Remove the folder and all of its contents
        except OSError as err:
            print(f"Error deleting folder: {err}")
        else:
            print(f"Deleted folder: {folder}")

    try:
        folder.mkdir(parents=True, exist_ok=True)  # Create it afresh
    except OSError as err:
        print(f"Error creating folder: {err}")
    else:
        print(f"Created folder: {folder}")
def extract_files(archive_dir=None, target_dir=None, member_stem=None):
    """
    Extracts the e-invoice PDF from every ZIP archive in the archive folder.
    Renames each extracted file based on the ZIP file's name.

    Problems with an individual archive are reported on stdout and the
    remaining archives are still processed.

    Args:
        archive_dir (Path, optional): Folder scanned for "*.zip" files.
            Defaults to the module-level ar_path.
        target_dir (Path, optional): Folder that receives the PDFs.
            Defaults to the module-level ei_path.
        member_stem (str, optional): Name, without the ".pdf" suffix, of the
            archive member to extract. Defaults to the module-level einv_name.
    """
    # Module-level defaults are looked up at call time so that a folder
    # chosen after import (ar_path is reassigned by the script) is honoured.
    src_dir = Path(ar_path if archive_dir is None else archive_dir)
    dst_dir = Path(ei_path if target_dir is None else target_dir)
    member = f"{einv_name if member_stem is None else member_stem}.pdf"

    for zip_file in src_dir.glob("*.zip"):
        try:
            with zipfile.ZipFile(zip_file, "r") as archive:
                try:
                    # extract() returns the path it actually created.
                    extracted_file = Path(archive.extract(member, path=dst_dir))
                    new_file_name = f"{zip_file.stem}.pdf"
                    extracted_file.rename(dst_dir.joinpath(new_file_name))
                except (KeyError, FileNotFoundError):
                    # A missing archive member is signalled with KeyError,
                    # not FileNotFoundError.
                    print(f"{member} not found in {zip_file}.")
                except PermissionError:
                    print(
                        f"Permission denied while extracting or renaming file from {zip_file}."
                    )
                except Exception as e:
                    print(f"Error during ZIP file extraction: {e}")
        except zipfile.BadZipFile:
            print(f"File {zip_file} is not a valid ZIP file.")
        except Exception as e:
            print(f"Error opening ZIP file {zip_file}: {e}")
def extract_reg(text, pattern):
    """
    Searches text for a regular expression and returns its first capture group.

    Args:
        text (str): The text to search in.
        pattern (str): The regular expression pattern; it must define at
            least one capture group.

    Returns:
        str or None: The content of capture group 1 of the first match, or
        None if the pattern does not match anywhere in the text.
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1)
def convert_date_format(date_str):
    """
    Rearranges a 'dd-mm-yyyy' date string into 'yyyymmdd'.

    The string is split on "-" and must yield exactly three parts; the
    parts are reordered as text and are not validated as a calendar date.

    Args:
        date_str (str): The date string in 'dd-mm-yyyy' format.

    Returns:
        str: The date string in 'yyyymmdd' format.
    """
    dd, mm, yyyy = date_str.split("-")
    return "".join((yyyy, mm, dd))
def escape_filename(filename):
    """
    Makes a string safe to use as a filename.

    Each of the characters < > : " / \\ | ? * is replaced with a hyphen;
    all other characters are kept as they are.

    Args:
        filename (str): The filename to sanitize.

    Returns:
        str: The sanitized filename.
    """
    forbidden = re.compile(r'[<>:"/\\|?*]')
    return forbidden.sub("-", filename)
def _write_invoice(writer, doc_no, label="file"):
    """
    Writes the pages collected in writer to '<doc_no>.pdf' in the output folder.

    I/O errors are reported on stdout instead of being raised so that the
    remaining invoices can still be processed.
    """
    try:
        with open(op_path.joinpath(doc_no + ".pdf"), "wb") as p:
            writer.write(p)
    except IOError as e:
        print(f"Error writing {label} {doc_no}.pdf: {e}")


def split_invoices():
    """
    Processes PDF files in the e-invoices folder to split them based on invoice details.

    Every page containing the e-invoice details marker starts a new invoice.
    The pages collected so far are written to the output folder under the
    document number of the invoice they belong to. After splitting, the
    source PDF is renamed to the acknowledgement date (yyyymmdd) found on its
    first page, when that page is a details page carrying such a date.
    """
    for file in ei_path.glob("*.pdf"):
        try:
            pdf = PdfReader(file)
        except FileNotFoundError:
            print(f"File {file} not found.")
            continue
        except PermissionError:
            print(f"Permission denied while opening file {file}.")
            continue
        except Exception as e:
            print(f"Error opening PDF file {file}: {e}")
            continue

        output = PdfWriter()
        new_file_name = file.stem
        doc_no = "einvoice"  # Name used for pages that precede any details page

        for i, page in enumerate(pdf.pages):
            try:
                txt = page.extract_text() or ""
            except Exception as e:
                print(f"Error extracting text from page {i} of {file}: {e}")
                continue

            if einv_det in txt:
                if i == 0:
                    ack_date = extract_reg(txt, ack_dt_ptn)
                    # Keep the original stem when no acknowledgement date is
                    # present instead of failing on a None value.
                    if ack_date:
                        new_file_name = convert_date_format(ack_date)
                elif output.pages:
                    # Flush the previous invoice while doc_no still holds
                    # ITS number; updating doc_no first would save these
                    # pages under the next invoice's name.
                    _write_invoice(output, doc_no)
                    # Start a fresh writer even if the write failed so pages
                    # of two different invoices are never merged.
                    output = PdfWriter()

                found_doc_no = extract_reg(txt, doc_no_ptn)
                if found_doc_no:
                    doc_no = escape_filename(found_doc_no)
                else:
                    # No document number on this details page: fall back to
                    # a name derived from the source file and page position.
                    doc_no = f"{file.stem}_page{i + 1}"
                    print(
                        f"Document number not found on page {i} of {file}; using {doc_no}."
                    )

            output.add_page(page)

        if output.pages:
            _write_invoice(output, doc_no, label="final file")

        try:
            file.rename(ei_path.joinpath(new_file_name + ".pdf"))
        except FileNotFoundError:
            print(f"File {file} not found for renaming.")
        except PermissionError:
            print(f"Permission denied while renaming file {file}.")
        except Exception as e:
            print(f"Error renaming file {file}: {e}")
if __name__ == "__main__":
    # Main execution block of the script: ask for the archive folder, reset
    # the working folders, extract the PDFs and split them into invoices.
    # ar_path is rebound at module level on purpose; extract_files() reads it.
    ar_path = select_folder()
    if not ar_path:
        print("No folder selected")
    else:
        ar_path = Path(ar_path)  # Ensure the selected path is a Path object
        print(f"Selected folder: {ar_path}")
        reset_folder(ei_path)
        reset_folder(op_path)
        extract_files()
        split_invoices()
        print("Done")