Merge pull request #1 from has-abi/codebase_refactoring

Refactor the codebase
has-abi · Feb 10, 2024 · 7d45535 · 7d45535
2 parents 8b0819f + 540a2d4
commit 7d45535
Show file tree

Hide file tree

Showing 12 changed files with 78 additions and 82 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -9,10 +9,10 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
-    - name: Set up Python 3.9
+    - name: Set up Python 3.10
       uses: actions/setup-python@v3
       with:
-        python-version: 3.9
+        python-version: "3.10"
 
     - name: Install Python dependencies
       run: |
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3

diff --git a/.gitignore b/.gitignore
@@ -95,4 +95,7 @@ ENV/
 .pytest_cache
 
 # html coverage
-htmlcov/
+htmlcov/
+
+# Pycharm
+.idea
diff --git a/docparser/__init__.py b/docparser/__init__.py
@@ -5,7 +5,6 @@
 
 
 from io import BufferedReader
-from typing import Union
 
 from docparser.document import Document
 from docparser.parser import Parser
@@ -14,7 +13,7 @@
 from docparser.xml_parser import XMLParser
 
 
-def parse(input_file: Union[str, BufferedReader]) -> Document:
+def parse(input_file: str | BufferedReader) -> Document:
     file_name, file_ext = get_file_name_and_ext(input_file)
     reader = Reader(input_file, file_ext)
     file_parser = XMLParser(reader.zip_file)

diff --git a/docparser/document.py b/docparser/document.py
@@ -19,4 +19,4 @@ class Document:
     name: str
     ext: str
     content: str
-    splitted_content: Dict[str, str]
+    divided_content: Dict[str, str]
diff --git a/docparser/exceptions.py b/docparser/exceptions.py
@@ -9,28 +9,23 @@
 """
 
 
-class InvalidArgumentTypeException(Exception):
+class InvalidArgumentTypeError(Exception):
     def __init__(self, message: str) -> None:
         super().__init__(message)
 
 
-class FileNotFoundException(Exception):
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
-
-
-class UnsupportedFileFormatException(Exception):
+class UnsupportedFileFormatError(Exception):
     def __init__(self, file_format: str) -> None:
         super().__init__(
             f"{file_format} if not supported. supported formats are docx and doc."
         )
 
 
-class MissingAttributeException(Exception):
+class MissingAttributeError(Exception):
     def __init__(self, message: str) -> None:
         super().__init__(message)
 
 
-class InvalidReturnValueException(Exception):
+class InvalidReturnValueError(Exception):
     def __init__(self, message: str) -> None:
         super().__init__(message)
diff --git a/docparser/parser.py b/docparser/parser.py
@@ -14,7 +14,7 @@
 from typing import Any
 
 from docparser.document import Document
-from docparser.exceptions import InvalidReturnValueException, MissingAttributeException
+from docparser.exceptions import InvalidReturnValueError, MissingAttributeError
 
 
 class Parser:
@@ -48,13 +48,13 @@ def __check(self, file_parser: Any) -> None:
             file_parser (Any): A file parser.
 
         Raises:
-            MissingAttributeException: Thrown if the file parser don't have
+            MissingAttributeError: Thrown if the file parser doesn't have
                 a callable `extract_text`
         """
         if not (
             hasattr(file_parser, "extract_text") and callable(file_parser.extract_text)
         ):
-            raise MissingAttributeException(
+            raise MissingAttributeError(
                 "Missing callable extract_text() from file_parser instance."
             )
 
@@ -66,21 +66,21 @@ def get_document(self, file_ext: str, file_name: str) -> Document:
             file_name (str): The original file name.
 
         Raises:
-            InvalidValueException: throw if the file parser callable
+            InvalidReturnValueError: Thrown if the file parser callable
                 `extract_text` return value is not a dict.
 
         Returns:
             Document: A document object that represents the parsed results.
         """
-        splitted_content = self.file_parser.extract_text()
-        if not isinstance(splitted_content, dict):
-            raise InvalidReturnValueException(
+        divided_content = self.file_parser.extract_text()
+        if not isinstance(divided_content, dict):
+            raise InvalidReturnValueError(
                 "The file parser extract_text callable return value must be a dict"
             )
-        content = " ".join(list(splitted_content.values()))
+        content = " ".join(list(divided_content.values()))
         return Document(
             name=file_name,
             ext=file_ext,
             content=content,
-            splitted_content=splitted_content,
+            divided_content=divided_content,
         )
diff --git a/docparser/reader.py b/docparser/reader.py
@@ -12,65 +12,63 @@
 
 import os
 from io import BufferedReader
-from typing import Union
 from zipfile import ZipFile
 
 import docparser.constants as CS
 from docparser.exceptions import (
-    FileNotFoundException,
-    InvalidArgumentTypeException,
-    UnsupportedFileFormatException,
+    InvalidArgumentTypeError,
+    UnsupportedFileFormatError,
 )
 
 
 class Reader:
     """Docparser `Reader` class that reads a docx file as a zip file.
 
     Args:
-        input_file (Union[str, BufferedReader]): Input file that could be a file
+        input_file (str | BufferedReader): Input file that could be a file
             or a file path.
         file_ext (str): The input file extension.
     """
 
-    def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
+    def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None:
         """Docparser `Reader` class that reads a docx file as a zip file.
 
         Args:
-            input_file (Union[str, BufferedReader]): Input file that could be a file
+            input_file (str | BufferedReader): Input file that could be a file
                 or a file path.
             file_ext (str): The input file extension.
         """
         self.__check(input_file, file_ext)
         self.input_file = input_file
         self.zip_file = self.to_zip()
 
-    def __check(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
-        """Check the input arguments of the class constuctor for invalid
+    def __check(self, input_file: str | BufferedReader, file_ext: str) -> None:
+        """Check the input arguments of the class constructor for invalid
         types or values.
 
         Args:
-            input_file (Union[str, BufferedReader]): Input file that could be a file
+            input_file (str | BufferedReader): Input file that could be a file
                 or a file path.
             file_ext (str): The input file extension.
 
         Raises:
-            InvalidArgumentTypeException: Thrown if any argument has an invalid
+            InvalidArgumentTypeError: Thrown if any argument has an invalid
                 type.
-            UnsupportedFileFormatException: Thrown if the input file has unsupported
+            UnsupportedFileFormatError: Thrown if the input file has unsupported
                 format.
-            FileNotFoundException: Thrown if the input file don't exist in disque or
+            FileNotFoundError: Thrown if the input file doesn't exist on disk or is
                 not found.
         """
         if not isinstance(input_file, (str, BufferedReader)):
-            raise InvalidArgumentTypeException(
+            raise InvalidArgumentTypeError(
                 "input_file must be a file path or a binary file."
             )
 
         if file_ext not in CS.ALLOWED_EXTS:
-            raise UnsupportedFileFormatException(file_ext)
+            raise UnsupportedFileFormatError(file_ext)
 
         if isinstance(input_file, str) and not os.path.isfile(input_file):
-            raise FileNotFoundException(f"File not found: {input_file}")
+            raise FileNotFoundError(f"File not found: {input_file}")
 
     def to_zip(self) -> ZipFile:
         """Convert the input file to a zip file.

diff --git a/docparser/utils.py b/docparser/utils.py
@@ -11,31 +11,29 @@
 
 import os
 from io import BufferedReader
-from typing import Tuple, Union
+from typing import Tuple
 
 
-def get_file_name_and_ext(
-    file_or_filepath: Union[str, BufferedReader]
-) -> Tuple[str, str]:
+def get_file_name_and_ext(file_or_filepath: str | BufferedReader) -> Tuple[str, str]:
     """Extract the file extension and the file name
     from a file or a file name.
 
     Args:
-        file_or_filepath (Union[str, BufferedReader]): File or file path.
+        file_or_filepath (str | BufferedReader): File or file path.
 
     Returns:
         Tuple[str, str]: Tuple of file name and file extension
     """
     filename = get_file_name(file_or_filepath)
     ext = filename.rsplit(".", 1)[1]
-    return (filename, ext.lower())
+    return filename, ext.lower()
 
 
-def get_file_name(file_or_filepath: Union[str, BufferedReader]) -> str:
+def get_file_name(file_or_filepath: str | BufferedReader) -> str:
     """Extract the file name form a file or a file path.
 
     Args:
-        file_or_filepath (Union[str, BufferedReader]): File or a file path.
+        file_or_filepath (str | BufferedReader): File or a file path.
 
     Returns:
         str: The extracted file name.

diff --git a/docparser/xml_parser.py b/docparser/xml_parser.py
@@ -12,25 +12,28 @@
 
 
 import re
-import xml.etree.ElementTree as ET
-from typing import Dict, List, Union
+import xml.etree.ElementTree as ETree
+from typing import Dict, List
 from zipfile import ZipFile
 
 import docparser.constants as CS
 from docparser.enums import LayoutEnum, TagEnum
-from docparser.exceptions import InvalidArgumentTypeException
+from docparser.exceptions import InvalidArgumentTypeError
+
+
+XML_Type = Dict[str, bytes | List[bytes]]
 
 
 class XMLParser:
-    """Docpatser `XMLParser` class that parses the input zip file
+    """Docparser `XMLParser` class that parses the input zip file
     using the python package `xml`.
 
     Args:
         input_file (ZipFile): Zip file.
     """
 
     def __init__(self, input_file: ZipFile) -> None:
-        """Docpatser `XMLParser` class that parses the input zip file
+        """Docparser `XMLParser` class that parses the input zip file
         using the python package `xml`.
 
         Args:
@@ -41,24 +44,24 @@ def __init__(self, input_file: ZipFile) -> None:
         self.__name_list = self.__zip_file.namelist()
 
     def __check(self, input_file: ZipFile) -> None:
-        """Check the input arguments of the class constuctor for invalid
+        """Check the input arguments of the class constructor for invalid
         types or values.
 
         Args:
             input_file (ZipFile): Zip file.
 
         Raises:
-            InvalidArgumentTypeException: Thrown if the input file is not an
+            InvalidArgumentTypeError: Thrown if the input file is not an
                 instance of ZipFile.
         """
         if not isinstance(input_file, ZipFile):
-            raise InvalidArgumentTypeException("input file must of type ZipFile.")
+            raise InvalidArgumentTypeError("input file must of type ZipFile.")
 
     def extract_text(self) -> Dict[str, str]:
         """Extract text from the zip file using XML.
 
         Returns:
-            Dict[str, str]: A dictionnary containing the document
+            Dict[str, str]: A dictionary containing the document
                 XML parts [head, body, footer] and their text.
         """
         doc_text: Dict[str, str] = {}
@@ -73,7 +76,7 @@ def extract_text(self) -> Dict[str, str]:
         return doc_text
 
     def xml2text(self, xml_part: bytes) -> str:
-        """Extract text from an xml component nodes.
+        """Extract text from xml component nodes.
 
         Args:
             xml_part (bytes): XML component.
@@ -82,7 +85,7 @@ def xml2text(self, xml_part: bytes) -> str:
             str: The extracted text.
         """
         text = ""
-        root = ET.fromstring(xml_part)
+        root = ETree.fromstring(xml_part)
         for child in root.iter():
             if child.tag == TagEnum.SPACE:
                 text += child.text if child.text is not None else ""
@@ -97,17 +100,18 @@ def xml2text(self, xml_part: bytes) -> str:
                 text += LayoutEnum.MAJ_BREAK_LINE
         return text
 
-    def to_xml(self) -> Dict[str, Union[bytes, List[bytes]]]:
+    def to_xml(self) -> XML_Type:
         """Convert a zip file to XML components header, body and footer.
 
         Returns:
-            Dict[str, Union[bytes, List[bytes]]]: Dictionnary containing
+            XML_Type: Dictionary containing
                 the components content.
         """
-        xml_parts: Dict[str, Union[bytes, List[bytes]]] = {}
-        xml_parts["header"] = self.get_xml_part_by_pattern(CS.XML_HEADER)
-        xml_parts["body"] = self.__zip_file.read(CS.XML_BODY)
-        xml_parts["footer"] = self.get_xml_part_by_pattern(CS.XML_FOOTER)
+        xml_parts: XML_Type = {
+            "header": self.get_xml_part_by_pattern(CS.XML_HEADER),
+            "body": self.__zip_file.read(CS.XML_BODY),
+            "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER),
+        }
         return xml_parts
 
     def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]: