From 08fc7a55118fb4c1cfd3982f08a021a33ae83702 Mon Sep 17 00:00:00 2001 From: Hassane Abida Date: Sat, 10 Feb 2024 14:53:17 +0100 Subject: [PATCH 1/3] Refactor the codebase --- .gitignore | 5 ++++- docparser/__init__.py | 3 +-- docparser/document.py | 2 +- docparser/exceptions.py | 13 ++++--------- docparser/parser.py | 18 +++++++++--------- docparser/reader.py | 30 ++++++++++++++---------------- docparser/utils.py | 12 ++++++------ docparser/xml_parser.py | 36 +++++++++++++++++++----------------- tests/test_parser.py | 12 ++++++------ tests/test_reader.py | 15 +++++++-------- tests/test_xml_parser.py | 4 ++-- 11 files changed, 73 insertions(+), 77 deletions(-) diff --git a/.gitignore b/.gitignore index d598c45..74c7e9a 100644 --- a/.gitignore +++ b/.gitignore @@ -95,4 +95,7 @@ ENV/ .pytest_cache # html coverage -htmlcov/ \ No newline at end of file +htmlcov/ + +# Pycharm +.idea \ No newline at end of file diff --git a/docparser/__init__.py b/docparser/__init__.py index 2145384..86130cf 100644 --- a/docparser/__init__.py +++ b/docparser/__init__.py @@ -5,7 +5,6 @@ from io import BufferedReader -from typing import Union from docparser.document import Document from docparser.parser import Parser @@ -14,7 +13,7 @@ from docparser.xml_parser import XMLParser -def parse(input_file: Union[str, BufferedReader]) -> Document: +def parse(input_file: str | BufferedReader) -> Document: file_name, file_ext = get_file_name_and_ext(input_file) reader = Reader(input_file, file_ext) file_parser = XMLParser(reader.zip_file) diff --git a/docparser/document.py b/docparser/document.py index e9fef71..43a6288 100644 --- a/docparser/document.py +++ b/docparser/document.py @@ -19,4 +19,4 @@ class Document: name: str ext: str content: str - splitted_content: Dict[str, str] + divided_content: Dict[str, str] diff --git a/docparser/exceptions.py b/docparser/exceptions.py index 2498808..0d82968 100644 --- a/docparser/exceptions.py +++ b/docparser/exceptions.py @@ -9,28 +9,23 @@ """ -class 
InvalidArgumentTypeException(Exception): +class InvalidArgumentTypeError(Exception): def __init__(self, message: str) -> None: super().__init__(message) -class FileNotFoundException(Exception): - def __init__(self, message: str) -> None: - super().__init__(message) - - -class UnsupportedFileFormatException(Exception): +class UnsupportedFileFormatError(Exception): def __init__(self, file_format: str) -> None: super().__init__( f"{file_format} if not supported. supported formats are docx and doc." ) -class MissingAttributeException(Exception): +class MissingAttributeError(Exception): def __init__(self, message: str) -> None: super().__init__(message) -class InvalidReturnValueException(Exception): +class InvalidReturnValueError(Exception): def __init__(self, message: str) -> None: super().__init__(message) diff --git a/docparser/parser.py b/docparser/parser.py index 33a6087..f98dead 100644 --- a/docparser/parser.py +++ b/docparser/parser.py @@ -14,7 +14,7 @@ from typing import Any from docparser.document import Document -from docparser.exceptions import InvalidReturnValueException, MissingAttributeException +from docparser.exceptions import InvalidReturnValueError, MissingAttributeError class Parser: @@ -48,13 +48,13 @@ def __check(self, file_parser: Any) -> None: file_parser (Any): A file parser. Raises: - MissingAttributeException: Thrown if the file parser don't have + MissingAttributeError: Thrown if the file parser don't have a callable `extract_text` """ if not ( hasattr(file_parser, "extract_text") and callable(file_parser.extract_text) ): - raise MissingAttributeException( + raise MissingAttributeError( "Missing callable extract_text() from file_parser instance." ) @@ -66,21 +66,21 @@ def get_document(self, file_ext: str, file_name: str) -> Document: file_name (str): The original file name. Raises: - InvalidValueException: throw if the file parser callable + InvalidReturnValueError: throw if the file parser callable `extract_text` return value is not a dict. 
Returns: Document: A document object that represents the parsed results. """ - splitted_content = self.file_parser.extract_text() - if not isinstance(splitted_content, dict): - raise InvalidReturnValueException( + divided_content = self.file_parser.extract_text() + if not isinstance(divided_content, dict): + raise InvalidReturnValueError( "The file parser extract_text callable return value must be a dict" ) - content = " ".join(list(splitted_content.values())) + content = " ".join(list(divided_content.values())) return Document( name=file_name, ext=file_ext, content=content, - splitted_content=splitted_content, + divided_content=divided_content, ) diff --git a/docparser/reader.py b/docparser/reader.py index bfaf38f..fadbac7 100644 --- a/docparser/reader.py +++ b/docparser/reader.py @@ -12,14 +12,12 @@ import os from io import BufferedReader -from typing import Union from zipfile import ZipFile import docparser.constants as CS from docparser.exceptions import ( - FileNotFoundException, - InvalidArgumentTypeException, - UnsupportedFileFormatException, + InvalidArgumentTypeError, + UnsupportedFileFormatError, ) @@ -27,16 +25,16 @@ class Reader: """Docparser `Reader` class that reads a docx file as a zip file. Args: - input_file (Union[str, BufferedReader]): Input file that could be a file + input_file (str | BufferedReader): Input file that could be a file or a file path. file_ext (str): The input file extension. """ - def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> None: + def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None: """Docparser `Reader` class that reads a docx file as a zip file. Args: - input_file (Union[str, BufferedReader]): Input file that could be a file + input_file (str | BufferedReader): Input file that could be a file or a file path. file_ext (str): The input file extension.
""" @@ -44,33 +42,33 @@ def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> Non self.input_file = input_file self.zip_file = self.to_zip() - def __check(self, input_file: Union[str, BufferedReader], file_ext: str) -> None: - """Check the input arguments of the class constuctor for invalid + def __check(self, input_file: str | BufferedReader, file_ext: str) -> None: + """Check the input arguments of the class constructor for invalid types or values. Args: - input_file (Union[str, BufferedReader]): Input file that could be a file + input_file (str | BufferedReader): Input file that could be a file or a file path. file_ext (str): The input file extension. Raises: - InvalidArgumentTypeException: Thrown if any argument has an invalid + InvalidArgumentTypeError: Thrown if any argument has an invalid type. - UnsupportedFileFormatException: Thrown if the input file has unsupported + UnsupportedFileFormatError: Thrown if the input file has unsupported format. - FileNotFoundException: Thrown if the input file don't exist in disque or + FileNotFoundError: Thrown if the input file don't exist in disque or not found. """ if not isinstance(input_file, (str, BufferedReader)): - raise InvalidArgumentTypeException( + raise InvalidArgumentTypeError( "input_file must be a file path or a binary file." ) if file_ext not in CS.ALLOWED_EXTS: - raise UnsupportedFileFormatException(file_ext) + raise UnsupportedFileFormatError(file_ext) if isinstance(input_file, str) and not os.path.isfile(input_file): - raise FileNotFoundException(f"File not found: {input_file}") + raise FileNotFoundError(f"File not found: {input_file}") def to_zip(self) -> ZipFile: """Convert the input file to a zip file.
diff --git a/docparser/utils.py b/docparser/utils.py index ab64ff0..cac0a4e 100644 --- a/docparser/utils.py +++ b/docparser/utils.py @@ -11,31 +11,31 @@ import os from io import BufferedReader -from typing import Tuple, Union +from typing import Tuple def get_file_name_and_ext( - file_or_filepath: Union[str, BufferedReader] + file_or_filepath: str | BufferedReader ) -> Tuple[str, str]: """Extract the file extension and the file name from a file or a file name. Args: - file_or_filepath (Union[str, BufferedReader]): File or file path. + file_or_filepath (str | BufferedReader): File or file path. Returns: Tuple[str, str]: Tuple of file name and file extension """ filename = get_file_name(file_or_filepath) ext = filename.rsplit(".", 1)[1] - return (filename, ext.lower()) + return filename, ext.lower() -def get_file_name(file_or_filepath: Union[str, BufferedReader]) -> str: +def get_file_name(file_or_filepath: str | BufferedReader) -> str: """Extract the file name form a file or a file path. Args: - file_or_filepath (Union[str, BufferedReader]): File or a file path. + file_or_filepath (str | BufferedReader): File or a file path. Returns: str: The extracted file name. diff --git a/docparser/xml_parser.py b/docparser/xml_parser.py index 2559065..87cd5d3 100644 --- a/docparser/xml_parser.py +++ b/docparser/xml_parser.py @@ -12,17 +12,20 @@ import re -import xml.etree.ElementTree as ET -from typing import Dict, List, Union +import xml.etree.ElementTree as ETree +from typing import Dict, List from zipfile import ZipFile import docparser.constants as CS from docparser.enums import LayoutEnum, TagEnum -from docparser.exceptions import InvalidArgumentTypeException +from docparser.exceptions import InvalidArgumentTypeError + + +XML_Type = Dict[str, bytes | List[bytes]] class XMLParser: - """Docpatser `XMLParser` class that parses the input zip file + """Docparser `XMLParser` class that parses the input zip file using the python package `xml`. 
Args: @@ -30,7 +33,7 @@ class XMLParser: """ def __init__(self, input_file: ZipFile) -> None: - """Docpatser `XMLParser` class that parses the input zip file + """Docparser `XMLParser` class that parses the input zip file using the python package `xml`. Args: @@ -41,24 +44,24 @@ def __init__(self, input_file: ZipFile) -> None: self.__name_list = self.__zip_file.namelist() def __check(self, input_file: ZipFile) -> None: - """Check the input arguments of the class constuctor for invalid + """Check the input arguments of the class constructor for invalid types or values. Args: input_file (ZipFile): Zip file. Raises: - InvalidArgumentTypeException: Thrown if the input file is not an + InvalidArgumentTypeError: Thrown if the input file is not an instance of ZipFile. """ if not isinstance(input_file, ZipFile): - raise InvalidArgumentTypeException("input file must of type ZipFile.") + raise InvalidArgumentTypeError("input file must of type ZipFile.") def extract_text(self) -> Dict[str, str]: """Extract text from the zip file using XML. Returns: - Dict[str, str]: A dictionnary containing the document + Dict[str, str]: A dictionary containing the document XML parts [head, body, footer] and their text. """ doc_text: Dict[str, str] = {} @@ -73,7 +76,7 @@ def extract_text(self) -> Dict[str, str]: return doc_text def xml2text(self, xml_part: bytes) -> str: - """Extract text from an xml component nodes. + """Extract text from xml component nodes. Args: xml_part (bytes): XML component. @@ -82,7 +85,7 @@ def xml2text(self, xml_part: bytes) -> str: str: The extracted text. 
""" text = "" - root = ET.fromstring(xml_part) + root = ETree.fromstring(xml_part) for child in root.iter(): if child.tag == TagEnum.SPACE: text += child.text if child.text is not None else "" @@ -97,17 +100,16 @@ def xml2text(self, xml_part: bytes) -> str: text += LayoutEnum.MAJ_BREAK_LINE return text - def to_xml(self) -> Dict[str, Union[bytes, List[bytes]]]: + def to_xml(self) -> XML_Type: """Convert a zip file to XML components header, body and footer. Returns: - Dict[str, Union[bytes, List[bytes]]]: Dictionnary containing + XML_Type: Dictionary containing the components content. """ - xml_parts: Dict[str, Union[bytes, List[bytes]]] = {} - xml_parts["header"] = self.get_xml_part_by_pattern(CS.XML_HEADER) - xml_parts["body"] = self.__zip_file.read(CS.XML_BODY) - xml_parts["footer"] = self.get_xml_part_by_pattern(CS.XML_FOOTER) + xml_parts: XML_Type = {"header": self.get_xml_part_by_pattern(CS.XML_HEADER), + "body": self.__zip_file.read(CS.XML_BODY), + "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER)} return xml_parts def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]: diff --git a/tests/test_parser.py b/tests/test_parser.py index 2f644d8..92009fc 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2,7 +2,7 @@ from unittest.mock import Mock from docparser.document import Document -from docparser.exceptions import InvalidReturnValueException, MissingAttributeException +from docparser.exceptions import InvalidReturnValueError, MissingAttributeError from docparser.parser import Parser @@ -25,13 +25,13 @@ def setUpClass(cls) -> None: def test_parser_with_invalid_file_parser(self): test_file_parser = "" - with self.assertRaises(MissingAttributeException): + with self.assertRaises(MissingAttributeError): Parser(file_parser=test_file_parser, file_ext="", file_name="") def test_invalid_file_parser_extract_text_callable_return(self): test_file_parser = Mock() test_file_parser.extract_text = Mock(return_value=["list item"]) - with 
self.assertRaises(InvalidReturnValueException): + with self.assertRaises(InvalidReturnValueError): Parser(file_parser=test_file_parser, file_ext="", file_name="") def test_get_document(self): @@ -39,12 +39,12 @@ def test_get_document(self): self.assertTrue(isinstance(result_document, Document)) self.assertEqual(result_document.name, "file_name_example.docx") self.assertEqual(result_document.ext, "docx") - self.assertTrue(isinstance(result_document.splitted_content, dict)) + self.assertTrue(isinstance(result_document.divided_content, dict)) self.assertListEqual( - list(result_document.splitted_content.keys()), ["header", "body", "footer"] + list(result_document.divided_content.keys()), ["header", "body", "footer"] ) self.assertListEqual( - list(result_document.splitted_content.values()), + list(result_document.divided_content.values()), ["xml header text", "xml body text", "xml footer text"], ) self.assertEqual( diff --git a/tests/test_reader.py b/tests/test_reader.py index 36477c8..8357373 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -3,9 +3,8 @@ from zipfile import ZipFile from docparser.exceptions import ( - FileNotFoundException, - InvalidArgumentTypeException, - UnsupportedFileFormatException, + InvalidArgumentTypeError, + UnsupportedFileFormatError, ) from docparser.reader import Reader @@ -14,16 +13,16 @@ class TestReader(unittest.TestCase): def test_read_empty_file(self): - with self.assertRaises(InvalidArgumentTypeException): + with self.assertRaises(InvalidArgumentTypeError): reader = Reader(input_file=None, file_ext="") # type: ignore def test_read_unsupported_file_type(self): - with self.assertRaises(UnsupportedFileFormatException): + with self.assertRaises(UnsupportedFileFormatError): reader = Reader(input_file="file_example.pdf", file_ext="pdf") - def test_read_inexistant_file(self): - with self.assertRaises(FileNotFoundException): - reader = Reader(input_file="inexistant_file.docx", file_ext="docx") + def 
test_read_missing_file(self): + with self.assertRaises(FileNotFoundError): + reader = Reader(input_file="missing_file.docx", file_ext="docx") def test_to_zip_str_file(self): test_reader = Reader(input_file=str(DOCX_FILE_PATH), file_ext="docx") diff --git a/tests/test_xml_parser.py b/tests/test_xml_parser.py index 51ae8eb..9e1dd97 100644 --- a/tests/test_xml_parser.py +++ b/tests/test_xml_parser.py @@ -2,7 +2,7 @@ from pathlib import Path from zipfile import ZipFile -from docparser.exceptions import InvalidArgumentTypeException +from docparser.exceptions import InvalidArgumentTypeError from docparser.xml_parser import XMLParser DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx" @@ -19,7 +19,7 @@ def setUpClass(cls) -> None: cls.xml_parser = XMLParser(cls.zip_file) def test_invalid_input_file(self): - with self.assertRaises(InvalidArgumentTypeException): + with self.assertRaises(InvalidArgumentTypeError): xml_parser = XMLParser(input_file="") # type: ignore def test_get_xml_part_by_pattern_header(self) -> None: From 718c7868f7b011cd5a8f7cb12f3ea569890b470c Mon Sep 17 00:00:00 2001 From: Hassane Abida Date: Sat, 10 Feb 2024 15:05:06 +0100 Subject: [PATCH 2/3] Reformatting --- docparser/utils.py | 4 +--- docparser/xml_parser.py | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docparser/utils.py b/docparser/utils.py index cac0a4e..e1e9138 100644 --- a/docparser/utils.py +++ b/docparser/utils.py @@ -14,9 +14,7 @@ from typing import Tuple -def get_file_name_and_ext( - file_or_filepath: str | BufferedReader -) -> Tuple[str, str]: +def get_file_name_and_ext(file_or_filepath: str | BufferedReader) -> Tuple[str, str]: """Extract the file extension and the file name from a file or a file name.
diff --git a/docparser/xml_parser.py b/docparser/xml_parser.py index 87cd5d3..6733c20 100644 --- a/docparser/xml_parser.py +++ b/docparser/xml_parser.py @@ -107,9 +107,11 @@ def to_xml(self) -> XML_Type: XML_Type: Dictionary containing the components content. """ - xml_parts: XML_Type = {"header": self.get_xml_part_by_pattern(CS.XML_HEADER), - "body": self.__zip_file.read(CS.XML_BODY), - "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER)} + xml_parts: XML_Type = { + "header": self.get_xml_part_by_pattern(CS.XML_HEADER), + "body": self.__zip_file.read(CS.XML_BODY), + "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER), + } return xml_parts def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]: From 540a2d4cb0cb055db3b1f9950e0421b5f1b2d0fe Mon Sep 17 00:00:00 2001 From: Hassane Abida Date: Sat, 10 Feb 2024 15:08:20 +0100 Subject: [PATCH 3/3] Update test workflow to work using Python 3.10 --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2656c9d..be6a430 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,10 +9,10 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python 3.9 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.9 + python-version: "3.10" - name: Install Python dependencies run: | @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v3