From 08fc7a55118fb4c1cfd3982f08a021a33ae83702 Mon Sep 17 00:00:00 2001
From: Hassane Abida <abidahass.uca@gmail.com>
Date: Sat, 10 Feb 2024 14:53:17 +0100
Subject: [PATCH 1/3] Refactor the codebase

---
 .gitignore               |  5 ++++-
 docparser/__init__.py    |  3 +--
 docparser/document.py    |  2 +-
 docparser/exceptions.py  | 13 ++++---------
 docparser/parser.py      | 18 +++++++++---------
 docparser/reader.py      | 30 ++++++++++++++----------------
 docparser/utils.py       | 12 ++++++------
 docparser/xml_parser.py  | 36 +++++++++++++++++++-----------------
 tests/test_parser.py     | 12 ++++++------
 tests/test_reader.py     | 15 +++++++--------
 tests/test_xml_parser.py |  4 ++--
 11 files changed, 73 insertions(+), 77 deletions(-)

diff --git a/.gitignore b/.gitignore
index d598c45..74c7e9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -95,4 +95,7 @@ ENV/
 .pytest_cache
 
 # html coverage
-htmlcov/
\ No newline at end of file
+htmlcov/
+
+# PyCharm
+.idea
\ No newline at end of file
diff --git a/docparser/__init__.py b/docparser/__init__.py
index 2145384..86130cf 100644
--- a/docparser/__init__.py
+++ b/docparser/__init__.py
@@ -5,7 +5,6 @@
 
 
 from io import BufferedReader
-from typing import Union
 
 from docparser.document import Document
 from docparser.parser import Parser
@@ -14,7 +13,7 @@
 from docparser.xml_parser import XMLParser
 
 
-def parse(input_file: Union[str, BufferedReader]) -> Document:
+def parse(input_file: str | BufferedReader) -> Document:
     file_name, file_ext = get_file_name_and_ext(input_file)
     reader = Reader(input_file, file_ext)
     file_parser = XMLParser(reader.zip_file)
diff --git a/docparser/document.py b/docparser/document.py
index e9fef71..43a6288 100644
--- a/docparser/document.py
+++ b/docparser/document.py
@@ -19,4 +19,4 @@ class Document:
     name: str
     ext: str
     content: str
-    splitted_content: Dict[str, str]
+    divided_content: Dict[str, str]
diff --git a/docparser/exceptions.py b/docparser/exceptions.py
index 2498808..0d82968 100644
--- a/docparser/exceptions.py
+++ b/docparser/exceptions.py
@@ -9,28 +9,23 @@
 """
 
 
-class InvalidArgumentTypeException(Exception):
+class InvalidArgumentTypeError(Exception):
     def __init__(self, message: str) -> None:
         super().__init__(message)
 
 
-class FileNotFoundException(Exception):
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
-
-
-class UnsupportedFileFormatException(Exception):
+class UnsupportedFileFormatError(Exception):
     def __init__(self, file_format: str) -> None:
         super().__init__(
             f"{file_format} if not supported. supported formats are docx and doc."
         )
 
 
-class MissingAttributeException(Exception):
+class MissingAttributeError(Exception):
     def __init__(self, message: str) -> None:
         super().__init__(message)
 
 
-class InvalidReturnValueException(Exception):
+class InvalidReturnValueError(Exception):
     def __init__(self, message: str) -> None:
         super().__init__(message)
diff --git a/docparser/parser.py b/docparser/parser.py
index 33a6087..f98dead 100644
--- a/docparser/parser.py
+++ b/docparser/parser.py
@@ -14,7 +14,7 @@
 from typing import Any
 
 from docparser.document import Document
-from docparser.exceptions import InvalidReturnValueException, MissingAttributeException
+from docparser.exceptions import InvalidReturnValueError, MissingAttributeError
 
 
 class Parser:
@@ -48,13 +48,13 @@ def __check(self, file_parser: Any) -> None:
             file_parser (Any): A file parser.
 
         Raises:
-            MissingAttributeException: Thrown if the file parser don't have
+            MissingAttributeError: Thrown if the file parser doesn't have
                 a callable `extract_text`
         """
         if not (
             hasattr(file_parser, "extract_text") and callable(file_parser.extract_text)
         ):
-            raise MissingAttributeException(
+            raise MissingAttributeError(
                 "Missing callable extract_text() from file_parser instance."
             )
 
@@ -66,21 +66,21 @@ def get_document(self, file_ext: str, file_name: str) -> Document:
             file_name (str): The original file name.
 
         Raises:
-            InvalidValueException: throw if the file parser callable
+            InvalidReturnValueError: Thrown if the file parser callable
                 `extract_text` return value is not a dict.
 
         Returns:
             Document: A document object that represents the parsed results.
         """
-        splitted_content = self.file_parser.extract_text()
-        if not isinstance(splitted_content, dict):
-            raise InvalidReturnValueException(
+        divided_content = self.file_parser.extract_text()
+        if not isinstance(divided_content, dict):
+            raise InvalidReturnValueError(
                 "The file parser extract_text callable return value must be a dict"
             )
-        content = " ".join(list(splitted_content.values()))
+        content = " ".join(list(divided_content.values()))
         return Document(
             name=file_name,
             ext=file_ext,
             content=content,
-            splitted_content=splitted_content,
+            divided_content=divided_content,
         )
diff --git a/docparser/reader.py b/docparser/reader.py
index bfaf38f..fadbac7 100644
--- a/docparser/reader.py
+++ b/docparser/reader.py
@@ -12,14 +12,12 @@
 
 import os
 from io import BufferedReader
-from typing import Union
 from zipfile import ZipFile
 
 import docparser.constants as CS
 from docparser.exceptions import (
-    FileNotFoundException,
-    InvalidArgumentTypeException,
-    UnsupportedFileFormatException,
+    InvalidArgumentTypeError,
+    UnsupportedFileFormatError,
 )
 
 
@@ -27,16 +25,16 @@ class Reader:
     """Docparser `Reader` class that reads a docx file as a zip file.
 
     Args:
-        input_file (Union[str, BufferedReader]): Input file that could be a file
+        input_file (str | BufferedReader): Input file that could be a file
             or a file path.
         file_ext (str): The input file extension.
     """
 
-    def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
+    def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None:
         """Docparser `Reader` class that reads a docx file as a zip file.
 
         Args:
-            input_file (Union[str, BufferedReader]): Input file that could be a file
+            input_file (str | BufferedReader): Input file that could be a file
                 or a file path.
             file_ext (str): The input file extension.
         """
@@ -44,33 +42,33 @@ def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> Non
         self.input_file = input_file
         self.zip_file = self.to_zip()
 
-    def __check(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
-        """Check the input arguments of the class constuctor for invalid
+    def __check(self, input_file: str | BufferedReader, file_ext: str) -> None:
+        """Check the input arguments of the class constructor for invalid
         types or values.
 
         Args:
-            input_file (Union[str, BufferedReader]): Input file that could be a file
+            input_file (str | BufferedReader): Input file that could be a file
                 or a file path.
             file_ext (str): The input file extension.
 
         Raises:
-            InvalidArgumentTypeException: Thrown if any argument has an invalid
+            InvalidArgumentTypeError: Thrown if any argument has an invalid
                 type.
-            UnsupportedFileFormatException: Thrown if the input file has unsupported
+            UnsupportedFileFormatError: Thrown if the input file has unsupported
                 format.
-            FileNotFoundException: Thrown if the input file don't exist in disque or
+            FileNotFoundError: Thrown if the input file doesn't exist on disk or
                 not found.
         """
         if not isinstance(input_file, (str, BufferedReader)):
-            raise InvalidArgumentTypeException(
+            raise InvalidArgumentTypeError(
                 "input_file must be a file path or a binary file."
             )
 
         if file_ext not in CS.ALLOWED_EXTS:
-            raise UnsupportedFileFormatException(file_ext)
+            raise UnsupportedFileFormatError(file_ext)
 
         if isinstance(input_file, str) and not os.path.isfile(input_file):
-            raise FileNotFoundException(f"File not found: {input_file}")
+            raise FileNotFoundError(f"File not found: {input_file}")
 
     def to_zip(self) -> ZipFile:
         """Convert the input file to a zip file.
diff --git a/docparser/utils.py b/docparser/utils.py
index ab64ff0..cac0a4e 100644
--- a/docparser/utils.py
+++ b/docparser/utils.py
@@ -11,31 +11,31 @@
 
 import os
 from io import BufferedReader
-from typing import Tuple, Union
+from typing import Tuple
 
 
 def get_file_name_and_ext(
-    file_or_filepath: Union[str, BufferedReader]
+    file_or_filepath: str | BufferedReader
 ) -> Tuple[str, str]:
     """Extract the file extension and the file name
     from a file or a file name.
 
     Args:
-        file_or_filepath (Union[str, BufferedReader]): File or file path.
+        file_or_filepath (str | BufferedReader): File or file path.
 
     Returns:
         Tuple[str, str]: Tuple of file name and file extension
     """
     filename = get_file_name(file_or_filepath)
     ext = filename.rsplit(".", 1)[1]
-    return (filename, ext.lower())
+    return filename, ext.lower()
 
 
-def get_file_name(file_or_filepath: Union[str, BufferedReader]) -> str:
+def get_file_name(file_or_filepath: str | BufferedReader) -> str:
     """Extract the file name form a file or a file path.
 
     Args:
-        file_or_filepath (Union[str, BufferedReader]): File or a file path.
+        file_or_filepath (str | BufferedReader): File or a file path.
 
     Returns:
         str: The extracted file name.
diff --git a/docparser/xml_parser.py b/docparser/xml_parser.py
index 2559065..87cd5d3 100644
--- a/docparser/xml_parser.py
+++ b/docparser/xml_parser.py
@@ -12,17 +12,20 @@
 
 
 import re
-import xml.etree.ElementTree as ET
-from typing import Dict, List, Union
+import xml.etree.ElementTree as ETree
+from typing import Dict, List
 from zipfile import ZipFile
 
 import docparser.constants as CS
 from docparser.enums import LayoutEnum, TagEnum
-from docparser.exceptions import InvalidArgumentTypeException
+from docparser.exceptions import InvalidArgumentTypeError
+
+
+XML_Type = Dict[str, bytes | List[bytes]]
 
 
 class XMLParser:
-    """Docpatser `XMLParser` class that parses the input zip file
+    """Docparser `XMLParser` class that parses the input zip file
     using the python package `xml`.
 
     Args:
@@ -30,7 +33,7 @@ class XMLParser:
     """
 
     def __init__(self, input_file: ZipFile) -> None:
-        """Docpatser `XMLParser` class that parses the input zip file
+        """Docparser `XMLParser` class that parses the input zip file
         using the python package `xml`.
 
         Args:
@@ -41,24 +44,24 @@ def __init__(self, input_file: ZipFile) -> None:
         self.__name_list = self.__zip_file.namelist()
 
     def __check(self, input_file: ZipFile) -> None:
-        """Check the input arguments of the class constuctor for invalid
+        """Check the input arguments of the class constructor for invalid
         types or values.
 
         Args:
             input_file (ZipFile): Zip file.
 
         Raises:
-            InvalidArgumentTypeException: Thrown if the input file is not an
+            InvalidArgumentTypeError: Thrown if the input file is not an
                 instance of ZipFile.
         """
         if not isinstance(input_file, ZipFile):
-            raise InvalidArgumentTypeException("input file must of type ZipFile.")
+            raise InvalidArgumentTypeError("input file must of type ZipFile.")
 
     def extract_text(self) -> Dict[str, str]:
         """Extract text from the zip file using XML.
 
         Returns:
-            Dict[str, str]: A dictionnary containing the document
+            Dict[str, str]: A dictionary containing the document
                 XML parts [head, body, footer] and their text.
         """
         doc_text: Dict[str, str] = {}
@@ -73,7 +76,7 @@ def extract_text(self) -> Dict[str, str]:
         return doc_text
 
     def xml2text(self, xml_part: bytes) -> str:
-        """Extract text from an xml component nodes.
+        """Extract text from xml component nodes.
 
         Args:
             xml_part (bytes): XML component.
@@ -82,7 +85,7 @@ def xml2text(self, xml_part: bytes) -> str:
             str: The extracted text.
         """
         text = ""
-        root = ET.fromstring(xml_part)
+        root = ETree.fromstring(xml_part)
         for child in root.iter():
             if child.tag == TagEnum.SPACE:
                 text += child.text if child.text is not None else ""
@@ -97,17 +100,16 @@ def xml2text(self, xml_part: bytes) -> str:
                 text += LayoutEnum.MAJ_BREAK_LINE
         return text
 
-    def to_xml(self) -> Dict[str, Union[bytes, List[bytes]]]:
+    def to_xml(self) -> XML_Type:
         """Convert a zip file to XML components header, body and footer.
 
         Returns:
-            Dict[str, Union[bytes, List[bytes]]]: Dictionnary containing
+            XML_Type: Dictionary containing
                 the components content.
         """
-        xml_parts: Dict[str, Union[bytes, List[bytes]]] = {}
-        xml_parts["header"] = self.get_xml_part_by_pattern(CS.XML_HEADER)
-        xml_parts["body"] = self.__zip_file.read(CS.XML_BODY)
-        xml_parts["footer"] = self.get_xml_part_by_pattern(CS.XML_FOOTER)
+        xml_parts: XML_Type = {"header": self.get_xml_part_by_pattern(CS.XML_HEADER),
+                               "body": self.__zip_file.read(CS.XML_BODY),
+                               "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER)}
         return xml_parts
 
     def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]:
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 2f644d8..92009fc 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2,7 +2,7 @@
 from unittest.mock import Mock
 
 from docparser.document import Document
-from docparser.exceptions import InvalidReturnValueException, MissingAttributeException
+from docparser.exceptions import InvalidReturnValueError, MissingAttributeError
 from docparser.parser import Parser
 
 
@@ -25,13 +25,13 @@ def setUpClass(cls) -> None:
 
     def test_parser_with_invalid_file_parser(self):
         test_file_parser = ""
-        with self.assertRaises(MissingAttributeException):
+        with self.assertRaises(MissingAttributeError):
             Parser(file_parser=test_file_parser, file_ext="", file_name="")
 
     def test_invalid_file_parser_extract_text_callable_return(self):
         test_file_parser = Mock()
         test_file_parser.extract_text = Mock(return_value=["list item"])
-        with self.assertRaises(InvalidReturnValueException):
+        with self.assertRaises(InvalidReturnValueError):
             Parser(file_parser=test_file_parser, file_ext="", file_name="")
 
     def test_get_document(self):
@@ -39,12 +39,12 @@ def test_get_document(self):
         self.assertTrue(isinstance(result_document, Document))
         self.assertEqual(result_document.name, "file_name_example.docx")
         self.assertEqual(result_document.ext, "docx")
-        self.assertTrue(isinstance(result_document.splitted_content, dict))
+        self.assertTrue(isinstance(result_document.divided_content, dict))
         self.assertListEqual(
-            list(result_document.splitted_content.keys()), ["header", "body", "footer"]
+            list(result_document.divided_content.keys()), ["header", "body", "footer"]
         )
         self.assertListEqual(
-            list(result_document.splitted_content.values()),
+            list(result_document.divided_content.values()),
             ["xml header text", "xml body text", "xml footer text"],
         )
         self.assertEqual(
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 36477c8..8357373 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -3,9 +3,8 @@
 from zipfile import ZipFile
 
 from docparser.exceptions import (
-    FileNotFoundException,
-    InvalidArgumentTypeException,
-    UnsupportedFileFormatException,
+    InvalidArgumentTypeError,
+    UnsupportedFileFormatError,
 )
 from docparser.reader import Reader
 
@@ -14,16 +13,16 @@
 
 class TestReader(unittest.TestCase):
     def test_read_empty_file(self):
-        with self.assertRaises(InvalidArgumentTypeException):
+        with self.assertRaises(InvalidArgumentTypeError):
             reader = Reader(input_file=None, file_ext="")  # type: ignore
 
     def test_read_unsupported_file_type(self):
-        with self.assertRaises(UnsupportedFileFormatException):
+        with self.assertRaises(UnsupportedFileFormatError):
             reader = Reader(input_file="file_example.pdf", file_ext="pdf")
 
-    def test_read_inexistant_file(self):
-        with self.assertRaises(FileNotFoundException):
-            reader = Reader(input_file="inexistant_file.docx", file_ext="docx")
+    def test_read_missing_file(self):
+        with self.assertRaises(FileNotFoundError):
+            reader = Reader(input_file="missing_file.docx", file_ext="docx")
 
     def test_to_zip_str_file(self):
         test_reader = Reader(input_file=str(DOCX_FILE_PATH), file_ext="docx")
diff --git a/tests/test_xml_parser.py b/tests/test_xml_parser.py
index 51ae8eb..9e1dd97 100644
--- a/tests/test_xml_parser.py
+++ b/tests/test_xml_parser.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 from zipfile import ZipFile
 
-from docparser.exceptions import InvalidArgumentTypeException
+from docparser.exceptions import InvalidArgumentTypeError
 from docparser.xml_parser import XMLParser
 
 DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx"
@@ -19,7 +19,7 @@ def setUpClass(cls) -> None:
         cls.xml_parser = XMLParser(cls.zip_file)
 
     def test_invalid_input_file(self):
-        with self.assertRaises(InvalidArgumentTypeException):
+        with self.assertRaises(InvalidArgumentTypeError):
             xml_parser = XMLParser(input_file="")  # type: ignore
 
     def test_get_xml_part_by_pattern_header(self) -> None:

From 718c7868f7b011cd5a8f7cb12f3ea569890b470c Mon Sep 17 00:00:00 2001
From: Hassane Abida <abidahass.uca@gmail.com>
Date: Sat, 10 Feb 2024 15:05:06 +0100
Subject: [PATCH 2/3] Reformatting

---
 docparser/utils.py      | 4 +---
 docparser/xml_parser.py | 8 +++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docparser/utils.py b/docparser/utils.py
index cac0a4e..e1e9138 100644
--- a/docparser/utils.py
+++ b/docparser/utils.py
@@ -14,9 +14,7 @@
 from typing import Tuple
 
 
-def get_file_name_and_ext(
-    file_or_filepath: str | BufferedReader
-) -> Tuple[str, str]:
+def get_file_name_and_ext(file_or_filepath: str | BufferedReader) -> Tuple[str, str]:
     """Extract the file extension and the file name
     from a file or a file name.
 
diff --git a/docparser/xml_parser.py b/docparser/xml_parser.py
index 87cd5d3..6733c20 100644
--- a/docparser/xml_parser.py
+++ b/docparser/xml_parser.py
@@ -107,9 +107,11 @@ def to_xml(self) -> XML_Type:
             XML_Type: Dictionary containing
                 the components content.
         """
-        xml_parts: XML_Type = {"header": self.get_xml_part_by_pattern(CS.XML_HEADER),
-                               "body": self.__zip_file.read(CS.XML_BODY),
-                               "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER)}
+        xml_parts: XML_Type = {
+            "header": self.get_xml_part_by_pattern(CS.XML_HEADER),
+            "body": self.__zip_file.read(CS.XML_BODY),
+            "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER),
+        }
         return xml_parts
 
     def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]:

From 540a2d4cb0cb055db3b1f9950e0421b5f1b2d0fe Mon Sep 17 00:00:00 2001
From: Hassane Abida <abidahass.uca@gmail.com>
Date: Sat, 10 Feb 2024 15:08:20 +0100
Subject: [PATCH 3/3] Update test workflow to work using Python 3.10

---
 .github/workflows/test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2656c9d..be6a430 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,10 +9,10 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
-    - name: Set up Python 3.9
+    - name: Set up Python 3.10
       uses: actions/setup-python@v3
       with:
-        python-version: 3.9
+        python-version: "3.10"
     
     - name: Install Python dependencies
       run: |
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3