Created
September 11, 2024 23:44
-
-
Save r1cc4rd0m4zz4/ebaa99bebc5bdc0af1325aac923d61a1 to your computer and use it in GitHub Desktop.
This Python script extracts text from PDF files using three different libraries: `pdfminer`, `unstructured`, and `pymupdf`. It allows you to choose the extraction method via command-line arguments and saves the extracted text to a specified output file or prints it to the terminal.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#python3 -m pip install pymupdf "unstructured[pdf]" pdfminer.six
#chmod +x pdf2text.py | |
#python3 pdf2text.py path/to/your/pdf/file.pdf -m unstructured -o output_file | |
import warnings | |
import argparse | |
import os | |
import importlib | |
import sys | |
#import pymupdf | |
#from unstructured.partition.auto import partition | |
#from pdfminer.high_level import extract_text | |
def extract_text_pdfminer(file_path):
    """Extract text from a PDF file using PDFMiner.

    The ``pdfminer.high_level`` module is imported at call time, so the
    library only has to be installed when this function is actually used.

    Args:
        file_path (str): The path of the PDF file from which to extract text.

    Returns:
        str: The text extracted from the PDF file.
    """
    high_level = importlib.import_module("pdfminer.high_level")
    return high_level.extract_text(file_path)
def extract_text_from_pdf_unstructured(pdf_path, strategy="hi_res"):
    """Extract text from a PDF file using the ``unstructured`` library.

    The ``unstructured.partition.auto`` module is imported at call time, so
    the library only has to be installed when this function is actually used.
    Warnings raised while importing the library and partitioning the document
    are suppressed.

    Args:
        pdf_path (str): The path of the PDF file from which to extract text.
        strategy (str): The partitioning strategy forwarded unchanged to
            ``partition``. Defaults to ``"hi_res"``, the value this function
            previously hard-coded.

    Returns:
        str: The string form of every element returned by ``partition``,
        joined with newlines.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        auto = importlib.import_module("unstructured.partition.auto")
        elements = auto.partition(pdf_path, strategy=strategy)
        text = "\n".join(str(el) for el in elements)
    return text
def extract_text_from_pdf_pymupdf(pdf_path):
    """Extract text from a PDF file using PyMuPDF.

    The library is imported at call time, so it only has to be installed when
    this function is actually used. The ``pymupdf`` module name is tried
    first; the legacy ``fitz`` name is used as a fallback for older releases.

    The page count is reported on stderr so that it never mixes with the
    extracted text when the caller prints that text to stdout.

    Args:
        pdf_path (str): The path of the PDF file from which to extract text.

    Returns:
        str: The concatenated text of all pages, in page order.
    """
    try:
        pymupdf = importlib.import_module("pymupdf")
    except ImportError:
        # Older PyMuPDF releases only expose the legacy module name.
        pymupdf = importlib.import_module("fitz")

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(pdf_path) as doc:
        print(f"Number of pages: {len(doc)}", file=sys.stderr)
        return "".join(page.get_text() for page in doc)
def save_text(text, base_file_path):
    """Write text to a new ``.txt`` file without overwriting existing files.

    The first candidate is ``<base_file_path>.txt``. If that already exists,
    ``<base_file_path>_1.txt``, ``<base_file_path>_2.txt``, ... are tried in
    turn until a free name is found. The chosen path is printed to stdout.

    Args:
        text (str): The text to write.
        base_file_path (str): The output path without the ``.txt`` extension.

    Raises:
        OSError: If the file cannot be created for a reason other than the
            name already being taken (for example a missing directory).
    """
    index = 0
    while True:
        suffix = f"_{index}" if index else ""
        file_path = f"{base_file_path}{suffix}.txt"
        try:
            # "x" creates the file atomically and fails if it already exists,
            # so a file appearing between a check and the write is never
            # clobbered. UTF-8 keeps non-ASCII PDF text writable regardless
            # of the platform's default encoding.
            with open(file_path, "x", encoding="utf-8") as file:
                file.write(text)
        except FileExistsError:
            index += 1
            continue
        break
    print(f"Text saved in: {file_path}")
def main():
    """Parse the command line, extract text from the PDF, and output it.

    The extraction method is selected with ``-m/--method``. When
    ``-o/--output`` is given the text is saved through ``save_text``;
    otherwise it is printed to stdout.

    Exits with status 2 (via ``parser.error``) when the input PDF does not
    exist, and with status 1 when the library required by the selected
    method cannot be imported.
    """
    # Maps each --method choice to the function implementing it. argparse
    # restricts the value to these keys, so no fallback branch is needed.
    extractors = {
        "unstructured": extract_text_from_pdf_unstructured,
        "pdfminer": extract_text_pdfminer,
        "pymupdf": extract_text_from_pdf_pymupdf,
    }

    parser = argparse.ArgumentParser(description="Extracts text from a PDF file and saves it to a text file.")
    parser.add_argument("pdf_path", help="The path of the PDF file from which to extract text.")
    parser.add_argument("-o", "--output", help="The path of the output text file. If not specified, the text will be printed to the terminal.")
    parser.add_argument("-m", "--method", choices=list(extractors), default="unstructured", help="The text extraction method: 'unstructured' (default) or 'pdfminer' or 'pymupdf'.")
    args = parser.parse_args()

    # Fail early with a usage-style message instead of a library traceback.
    if not os.path.isfile(args.pdf_path):
        parser.error(f"PDF file not found: {args.pdf_path}")

    try:
        text = extractors[args.method](args.pdf_path)
    except ImportError as err:
        # Each backend is an optional dependency imported only when selected.
        print(f"The '{args.method}' method requires a package that is not installed: {err}", file=sys.stderr)
        sys.exit(1)

    if args.output:
        save_text(text, args.output)
    else:
        print(text)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment