Skip to content

Instantly share code, notes, and snippets.

@r1cc4rd0m4zz4
Created September 11, 2024 23:44
Show Gist options
  • Save r1cc4rd0m4zz4/ebaa99bebc5bdc0af1325aac923d61a1 to your computer and use it in GitHub Desktop.
Save r1cc4rd0m4zz4/ebaa99bebc5bdc0af1325aac923d61a1 to your computer and use it in GitHub Desktop.
This Python script extracts text from PDF files using three different libraries: `pdfminer`, `unstructured`, and `pymupdf`. It allows you to choose the extraction method via command-line arguments and saves the extracted text to a specified output file or prints it to the terminal.
#!/usr/bin/env python3
#python3 -m pip install pymupdf "unstructured[pdf]" pdfminer.six
#chmod +x pdf2text.py
#python3 pdf2text.py path/to/your/pdf/file.pdf -m unstructured -o output_file
import warnings
import argparse
import os
import importlib
import sys
#import pymupdf
#from unstructured.partition.auto import partition
#from pdfminer.high_level import extract_text
def extract_text_pdfminer(file_path):
    """Extract text from a PDF file using PDFMiner.

    ``pdfminer.high_level`` is imported at call time rather than at module
    import, so the library is only needed when this method is actually used.

    Args:
        file_path (str): The path of the PDF file from which to extract text.

    Returns:
        str: The text extracted from the PDF file.
    """
    high_level = importlib.import_module("pdfminer.high_level")
    # Use the extract_text function from PDFMiner to extract the text
    return high_level.extract_text(file_path)
def extract_text_from_pdf_unstructured(pdf_path):
    """Extract text from a PDF file using ``unstructured``.

    The ``unstructured.partition.auto`` module is imported at call time and
    its ``partition`` function is run with ``strategy="hi_res"``. Warnings
    raised while importing and partitioning are silenced.

    Args:
        pdf_path (str): The path of the PDF file from which to extract text.

    Returns:
        str: The string form of every returned element, one per line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        auto_module = importlib.import_module("unstructured.partition.auto")
        parsed = auto_module.partition(pdf_path, strategy="hi_res")
        joined = "\n".join(map(str, parsed))
    return joined
def extract_text_from_pdf_pymupdf(pdf_path):
    """Extract text from a PDF file using PyMuPDF.

    The ``fitz`` module is imported at call time, so PyMuPDF is only needed
    when this method is actually used.

    Args:
        pdf_path (str): The path of the PDF file from which to extract text.

    Returns:
        str: The text of all pages, concatenated in page order.
    """
    pymupdf = importlib.import_module("fitz")
    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(pdf_path) as doc:
        # Diagnostic goes to stderr so it never mixes with extracted text
        # that the caller prints to stdout.
        print(f"Number of pages: {len(doc)}", file=sys.stderr)
        return "".join(page.get_text() for page in doc)
def save_text(text, base_file_path):
    """Write ``text`` to a new ``.txt`` file without overwriting existing ones.

    ``.txt`` is always appended to ``base_file_path``. If that file already
    exists, ``_1``, ``_2``, ... is inserted before the extension until an
    unused name is found. The chosen path is printed to stdout.

    Args:
        text (str): The text to write.
        base_file_path (str): Output path without the ``.txt`` extension.
    """
    index = 0
    while True:
        suffix = f"_{index}" if index else ""
        file_path = f"{base_file_path}{suffix}.txt"
        try:
            # "x" creates the file exclusively, so checking for an existing
            # file and claiming the name happen in one step. UTF-8 is used
            # explicitly because extracted PDF text is often non-ASCII and
            # the platform default encoding may not be able to represent it.
            with open(file_path, "x", encoding="utf-8") as file:
                file.write(text)
        except FileExistsError:
            index += 1
            continue
        break
    print(f"Text saved in: {file_path}")
def main(argv=None):
    """Parse command-line arguments, extract the PDF text and output it.

    The text is saved through ``save_text`` when ``--output`` is given and
    printed to stdout otherwise.

    Args:
        argv (list[str] | None): Arguments to parse. ``None`` (the default)
            makes argparse read them from ``sys.argv``.

    Raises:
        SystemExit: With status 2 when the arguments are invalid or
            ``pdf_path`` is not an existing file, and with status 1 when the
            library for the chosen method cannot be imported.
    """
    parser = argparse.ArgumentParser(description="Extracts text from a PDF file and saves it to a text file.")
    parser.add_argument("pdf_path", help="The path of the PDF file from which to extract text.")
    parser.add_argument("-o", "--output", help="The path of the output text file. If not specified, the text will be printed to the terminal.")
    parser.add_argument("-m", "--method", choices=["unstructured", "pdfminer", "pymupdf"], default="unstructured", help="The text extraction method: 'unstructured' (default) or 'pdfminer' or 'pymupdf'.")
    args = parser.parse_args(argv)

    # Fail early with a usage error instead of a library-specific traceback.
    if not os.path.isfile(args.pdf_path):
        parser.error(f"PDF file not found: {args.pdf_path}")

    # argparse's `choices` guarantees args.method is one of these keys.
    extractors = {
        "unstructured": extract_text_from_pdf_unstructured,
        "pdfminer": extract_text_pdfminer,
        "pymupdf": extract_text_from_pdf_pymupdf,
    }
    try:
        text = extractors[args.method](args.pdf_path)
    except ImportError as err:
        # The extraction libraries are imported lazily, so a missing one
        # only shows up here.
        print(f"Missing dependency for method '{args.method}': {err}", file=sys.stderr)
        sys.exit(1)

    if args.output:
        save_text(text, args.output)
    else:
        print(text)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment