r1cc4rd0m4zz4 · September 11, 2024 23:44
diff --git a/pdf2text.py b/pdf2text.py
 #!/usr/bin/env python3
 #python3 -m pip install pymupdf unstructured pdfminer
 #chmod +x pdf2text.py
 #python3 pdf2text.py path/to/your/pdf/file.pdf -m unstructured -o output_file

 import warnings
 import argparse
 import os
 import importlib
 import sys
 #import pymupdf
 #from unstructured.partition.auto import partition
 #from pdfminer.high_level import extract_text

 def extract_text_pdfminer(file_path):
    """
    Extract text from a PDF file using PDFMiner.

    The ``pdfminer.high_level`` module is imported inside the function, so
    the package only has to be importable when this extractor is called.

    Args:
        file_path (str): The path of the PDF file from which to extract text.

    Returns:
        str: The text extracted from the PDF file.

    Raises:
        ImportError: If ``pdfminer.high_level`` cannot be imported.
    """
    # Deferred import: resolved only when this extractor is actually used.
    high_level = importlib.import_module("pdfminer.high_level")
    return high_level.extract_text(file_path)

 def extract_text_from_pdf_unstructured(pdf_path):
    """
    Extract text from a PDF file using the ``unstructured`` partitioner.

    ``unstructured.partition.auto`` is imported inside the function and its
    ``partition`` callable is invoked with ``strategy="hi_res"``. Warnings
    emitted during the import and the partitioning are ignored.

    Args:
        pdf_path: The path of the PDF file, passed to ``partition`` as is.

    Returns:
        str: The string form of every returned element, one per line.
    """
    with warnings.catch_warnings():
        # Silence warnings for the import as well as the partition call.
        warnings.simplefilter("ignore")
        auto = importlib.import_module("unstructured.partition.auto")
        parts = auto.partition(pdf_path, strategy="hi_res")
        return "\n".join(map(str, parts))

 def extract_text_from_pdf_pymupdf(pdf_path):
    """
    Extract text from a PDF file using PyMuPDF (imported as ``fitz``).

    The page count is printed to standard output before extraction.

    Args:
        pdf_path: The path of the PDF file, passed to ``fitz.open`` as is.

    Returns:
        str: The text of all pages concatenated in page order.

    Raises:
        ImportError: If the ``fitz`` module cannot be imported.
    """
    pymupdf = importlib.import_module("fitz")
    # The context manager closes the document even if a page fails to
    # extract; the original left the document open.
    with pymupdf.open(pdf_path) as doc:
        print(f"Number of pages: {len(doc)}")
        # A single join avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

 def save_text(text, base_file_path):
    """
    Write text to ``<base_file_path>.txt`` without overwriting existing files.

    If that name is already taken, ``_1``, ``_2``, ... is appended to the
    base path until an unused name is found. The final path is printed.

    Args:
        text (str): The text to write.
        base_file_path (str): Output path without the ``.txt`` extension.
    """
    index = 0
    while True:
        stem = base_file_path if index == 0 else f"{base_file_path}_{index}"
        file_path = stem + ".txt"
        try:
            # "x" creates the file and fails if it already exists, so a file
            # that appears between a check and the open is never overwritten.
            # UTF-8 keeps non-ASCII text writable regardless of the locale.
            output = open(file_path, "x", encoding="utf-8")
        except FileExistsError:
            index += 1
            continue
        with output:
            output.write(text)
        break
    print(f"Text saved in: {file_path}")

 def main(argv=None):
    """
    Parse the command line, extract text from the PDF and output it.

    The text is written through ``save_text`` when ``--output`` is given,
    otherwise it is printed to standard output.

    Args:
        argv (list[str] | None): Arguments to parse. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 on invalid arguments or a missing PDF
            file, and status 1 when the selected method's package cannot
            be imported.
    """
    parser = argparse.ArgumentParser(
        description="Extracts text from a PDF file and saves it to a text file."
    )
    parser.add_argument(
        "pdf_path",
        help="The path of the PDF file from which to extract text.",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="The base path of the output text file ('.txt' is appended). "
        "If not specified, the text will be printed to the terminal.",
    )
    parser.add_argument(
        "-m",
        "--method",
        choices=["unstructured", "pdfminer", "pymupdf"],
        default="unstructured",
        help="The text extraction method: 'unstructured' (default) or 'pdfminer' or 'pymupdf'.",
    )

    args = parser.parse_args(argv)

    # Fail early with a usage error instead of a library-specific traceback.
    if not os.path.isfile(args.pdf_path):
        parser.error(f"PDF file not found: {args.pdf_path}")

    # argparse ``choices`` already guarantees the key exists, so no fallback
    # branch is needed.
    extractors = {
        "unstructured": extract_text_from_pdf_unstructured,
        "pdfminer": extract_text_pdfminer,
        "pymupdf": extract_text_from_pdf_pymupdf,
    }
    try:
        text = extractors[args.method](args.pdf_path)
    except ImportError as err:
        # The extractors import their backends lazily; report a missing one
        # on stderr and exit with status 1.
        sys.exit(
            f"The '{args.method}' method requires a package that could not be imported: {err}"
        )

    if args.output:
        save_text(text, args.output)
    else:
        print(text)
        
 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	#python3 -m pip install pymupdf unstructured pdfminer
	#chmod +x pdf2text.py
	#python3 pdf2text.py path/to/your/pdf/file.pdf -m unstructured -o output_file

	import warnings
	import argparse
	import os
	import importlib
	import sys
	#import pymupdf
	#from unstructured.partition.auto import partition
	#from pdfminer.high_level import extract_text

	def extract_text_pdfminer(file_path):
	    """
	    Extract text from a PDF file using PDFMiner.

	    The ``pdfminer.high_level`` module is imported inside the function, so
	    the package only has to be importable when this extractor is called.

	    Args:
	        file_path (str): The path of the PDF file from which to extract text.

	    Returns:
	        str: The text extracted from the PDF file.

	    Raises:
	        ImportError: If ``pdfminer.high_level`` cannot be imported.
	    """
	    # Deferred import: resolved only when this extractor is actually used.
	    high_level = importlib.import_module("pdfminer.high_level")
	    return high_level.extract_text(file_path)

	def extract_text_from_pdf_unstructured(pdf_path):
	    """
	    Extract text from a PDF file using the ``unstructured`` partitioner.

	    ``unstructured.partition.auto`` is imported inside the function and its
	    ``partition`` callable is invoked with ``strategy="hi_res"``. Warnings
	    emitted during the import and the partitioning are ignored.

	    Args:
	        pdf_path: The path of the PDF file, passed to ``partition`` as is.

	    Returns:
	        str: The string form of every returned element, one per line.
	    """
	    with warnings.catch_warnings():
	        # Silence warnings for the import as well as the partition call.
	        warnings.simplefilter("ignore")
	        auto = importlib.import_module("unstructured.partition.auto")
	        parts = auto.partition(pdf_path, strategy="hi_res")
	        return "\n".join(map(str, parts))

	def extract_text_from_pdf_pymupdf(pdf_path):
	    """
	    Extract text from a PDF file using PyMuPDF (imported as ``fitz``).

	    The page count is printed to standard output before extraction.

	    Args:
	        pdf_path: The path of the PDF file, passed to ``fitz.open`` as is.

	    Returns:
	        str: The text of all pages concatenated in page order.

	    Raises:
	        ImportError: If the ``fitz`` module cannot be imported.
	    """
	    pymupdf = importlib.import_module("fitz")
	    # The context manager closes the document even if a page fails to
	    # extract; the original left the document open.
	    with pymupdf.open(pdf_path) as doc:
	        print(f"Number of pages: {len(doc)}")
	        # A single join avoids rebuilding the string once per page.
	        return "".join(page.get_text() for page in doc)

	def save_text(text, base_file_path):
	    """
	    Write text to ``<base_file_path>.txt`` without overwriting existing files.

	    If that name is already taken, ``_1``, ``_2``, ... is appended to the
	    base path until an unused name is found. The final path is printed.

	    Args:
	        text (str): The text to write.
	        base_file_path (str): Output path without the ``.txt`` extension.
	    """
	    index = 0
	    while True:
	        stem = base_file_path if index == 0 else f"{base_file_path}_{index}"
	        file_path = stem + ".txt"
	        try:
	            # "x" creates the file and fails if it already exists, so a file
	            # that appears between a check and the open is never overwritten.
	            # UTF-8 keeps non-ASCII text writable regardless of the locale.
	            output = open(file_path, "x", encoding="utf-8")
	        except FileExistsError:
	            index += 1
	            continue
	        with output:
	            output.write(text)
	        break
	    print(f"Text saved in: {file_path}")

	def main(argv=None):
	    """
	    Parse the command line, extract text from the PDF and output it.

	    The text is written through ``save_text`` when ``--output`` is given,
	    otherwise it is printed to standard output.

	    Args:
	        argv (list[str] | None): Arguments to parse. ``None`` (the default)
	            makes argparse read ``sys.argv[1:]``.

	    Raises:
	        SystemExit: With status 2 on invalid arguments or a missing PDF
	            file, and status 1 when the selected method's package cannot
	            be imported.
	    """
	    parser = argparse.ArgumentParser(
	        description="Extracts text from a PDF file and saves it to a text file."
	    )
	    parser.add_argument(
	        "pdf_path",
	        help="The path of the PDF file from which to extract text.",
	    )
	    parser.add_argument(
	        "-o",
	        "--output",
	        help="The base path of the output text file ('.txt' is appended). "
	        "If not specified, the text will be printed to the terminal.",
	    )
	    parser.add_argument(
	        "-m",
	        "--method",
	        choices=["unstructured", "pdfminer", "pymupdf"],
	        default="unstructured",
	        help="The text extraction method: 'unstructured' (default) or 'pdfminer' or 'pymupdf'.",
	    )

	    args = parser.parse_args(argv)

	    # Fail early with a usage error instead of a library-specific traceback.
	    if not os.path.isfile(args.pdf_path):
	        parser.error(f"PDF file not found: {args.pdf_path}")

	    # argparse ``choices`` already guarantees the key exists, so no fallback
	    # branch is needed.
	    extractors = {
	        "unstructured": extract_text_from_pdf_unstructured,
	        "pdfminer": extract_text_pdfminer,
	        "pymupdf": extract_text_from_pdf_pymupdf,
	    }
	    try:
	        text = extractors[args.method](args.pdf_path)
	    except ImportError as err:
	        # The extractors import their backends lazily; report a missing one
	        # on stderr and exit with status 1.
	        sys.exit(
	            f"The '{args.method}' method requires a package that could not be imported: {err}"
	        )

	    if args.output:
	        save_text(text, args.output)
	    else:
	        print(text)

	# Run the command-line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()