Skip to content

Commit

Permalink
Refactor the codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
has-abi committed Feb 10, 2024
1 parent 8b0819f commit 08fc7a5
Show file tree
Hide file tree
Showing 11 changed files with 73 additions and 77 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,7 @@ ENV/
.pytest_cache

# html coverage
htmlcov/
htmlcov/

# Pycharm
.idea
3 changes: 1 addition & 2 deletions docparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@


from io import BufferedReader
from typing import Union

from docparser.document import Document
from docparser.parser import Parser
Expand All @@ -14,7 +13,7 @@
from docparser.xml_parser import XMLParser


def parse(input_file: Union[str, BufferedReader]) -> Document:
def parse(input_file: str | BufferedReader) -> Document:
file_name, file_ext = get_file_name_and_ext(input_file)
reader = Reader(input_file, file_ext)
file_parser = XMLParser(reader.zip_file)
Expand Down
2 changes: 1 addition & 1 deletion docparser/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ class Document:
name: str
ext: str
content: str
splitted_content: Dict[str, str]
divided_content: Dict[str, str]
13 changes: 4 additions & 9 deletions docparser/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,23 @@
"""


class InvalidArgumentTypeException(Exception):
class InvalidArgumentTypeError(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)


class FileNotFoundException(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)


class UnsupportedFileFormatException(Exception):
class UnsupportedFileFormatError(Exception):
def __init__(self, file_format: str) -> None:
super().__init__(
f"{file_format} if not supported. supported formats are docx and doc."
)


class MissingAttributeException(Exception):
class MissingAttributeError(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)


class InvalidReturnValueException(Exception):
class InvalidReturnValueError(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)
18 changes: 9 additions & 9 deletions docparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from typing import Any

from docparser.document import Document
from docparser.exceptions import InvalidReturnValueException, MissingAttributeException
from docparser.exceptions import InvalidReturnValueError, MissingAttributeError


class Parser:
Expand Down Expand Up @@ -48,13 +48,13 @@ def __check(self, file_parser: Any) -> None:
file_parser (Any): A file parser.
Raises:
MissingAttributeException: Thrown if the file parser don't have
MissingAttributeError: Thrown if the file parser doesn't have
a callable `extract_text`
"""
if not (
hasattr(file_parser, "extract_text") and callable(file_parser.extract_text)
):
raise MissingAttributeException(
raise MissingAttributeError(
"Missing callable extract_text() from file_parser instance."
)

Expand All @@ -66,21 +66,21 @@ def get_document(self, file_ext: str, file_name: str) -> Document:
file_name (str): The original file name.
Raises:
InvalidValueException: throw if the file parser callable
InvalidReturnValueError: Thrown if the file parser callable
`extract_text` return value is not a dict.
Returns:
Document: A document object that represents the parsed results.
"""
splitted_content = self.file_parser.extract_text()
if not isinstance(splitted_content, dict):
raise InvalidReturnValueException(
divided_content = self.file_parser.extract_text()
if not isinstance(divided_content, dict):
raise InvalidReturnValueError(
"The file parser extract_text callable return value must be a dict"
)
content = " ".join(list(splitted_content.values()))
content = " ".join(list(divided_content.values()))
return Document(
name=file_name,
ext=file_ext,
content=content,
splitted_content=splitted_content,
divided_content=divided_content,
)
30 changes: 14 additions & 16 deletions docparser/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,65 +12,63 @@

import os
from io import BufferedReader
from typing import Union
from zipfile import ZipFile

import docparser.constants as CS
from docparser.exceptions import (
FileNotFoundException,
InvalidArgumentTypeException,
UnsupportedFileFormatException,
InvalidArgumentTypeError,
UnsupportedFileFormatError,
)


class Reader:
"""Docparser `Reader` class that reads a docx file as a zip file.
Args:
input_file (Union[str, BufferedReader]): Input file that could be a file
input_file (str | BufferedReader): Input file that could be a file
or a file path.
file_ext (str): The input file extension.
"""

def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None:
"""Docparser `Reader` class that reads a docx file as a zip file.
Args:
input_file (Union[str, BufferedReader]): Input file that could be a file
input_file (str | BufferedReader): Input file that could be a file
or a file path.
file_ext (str): The input file extension.
"""
self.__check(input_file, file_ext)
self.input_file = input_file
self.zip_file = self.to_zip()

def __check(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
"""Check the input arguments of the class constuctor for invalid
def __check(self, input_file: str | BufferedReader, file_ext: str) -> None:
"""Check the input arguments of the class constructor for invalid
types or values.
Args:
input_file (Union[str, BufferedReader]): Input file that could be a file
input_file (str | BufferedReader): Input file that could be a file
or a file path.
file_ext (str): The input file extension.
Raises:
InvalidArgumentTypeException: Thrown if any argument has an invalid
InvalidArgumentTypeError: Thrown if any argument has an invalid
type.
UnsupportedFileFormatException: Thrown if the input file has unsupported
UnsupportedFileFormatError: Thrown if the input file has unsupported
format.
FileNotFoundException: Thrown if the input file don't exist in disque or
FileNotFoundError: Thrown if the input file doesn't exist on disk or is
not found.
"""
if not isinstance(input_file, (str, BufferedReader)):
raise InvalidArgumentTypeException(
raise InvalidArgumentTypeError(
"input_file must be a file path or a binary file."
)

if file_ext not in CS.ALLOWED_EXTS:
raise UnsupportedFileFormatException(file_ext)
raise UnsupportedFileFormatError(file_ext)

if isinstance(input_file, str) and not os.path.isfile(input_file):
raise FileNotFoundException(f"File not found: {input_file}")
raise FileNotFoundError(f"File not found: {input_file}")

def to_zip(self) -> ZipFile:
"""Convert the input file to a zip file.
Expand Down
12 changes: 6 additions & 6 deletions docparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,31 @@

import os
from io import BufferedReader
from typing import Tuple, Union
from typing import Tuple


def get_file_name_and_ext(
file_or_filepath: Union[str, BufferedReader]
file_or_filepath: str | BufferedReader
) -> Tuple[str, str]:
"""Extract the file extension and the file name
from a file or a file name.
Args:
file_or_filepath (Union[str, BufferedReader]): File or file path.
file_or_filepath (str | BufferedReader): File or file path.
Returns:
Tuple[str, str]: Tuple of file name and file extension
"""
filename = get_file_name(file_or_filepath)
ext = filename.rsplit(".", 1)[1]
return (filename, ext.lower())
return filename, ext.lower()


def get_file_name(file_or_filepath: Union[str, BufferedReader]) -> str:
def get_file_name(file_or_filepath: str | BufferedReader) -> str:
"""Extract the file name from a file or a file path.
Args:
file_or_filepath (Union[str, BufferedReader]): File or a file path.
file_or_filepath (str | BufferedReader): File or a file path.
Returns:
str: The extracted file name.
Expand Down
36 changes: 19 additions & 17 deletions docparser/xml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,28 @@


import re
import xml.etree.ElementTree as ET
from typing import Dict, List, Union
import xml.etree.ElementTree as ETree
from typing import Dict, List
from zipfile import ZipFile

import docparser.constants as CS
from docparser.enums import LayoutEnum, TagEnum
from docparser.exceptions import InvalidArgumentTypeException
from docparser.exceptions import InvalidArgumentTypeError


XML_Type = Dict[str, bytes | List[bytes]]


class XMLParser:
"""Docpatser `XMLParser` class that parses the input zip file
"""Docparser `XMLParser` class that parses the input zip file
using the python package `xml`.
Args:
input_file (ZipFile): Zip file.
"""

def __init__(self, input_file: ZipFile) -> None:
"""Docpatser `XMLParser` class that parses the input zip file
"""Docparser `XMLParser` class that parses the input zip file
using the python package `xml`.
Args:
Expand All @@ -41,24 +44,24 @@ def __init__(self, input_file: ZipFile) -> None:
self.__name_list = self.__zip_file.namelist()

def __check(self, input_file: ZipFile) -> None:
"""Check the input arguments of the class constuctor for invalid
"""Check the input arguments of the class constructor for invalid
types or values.
Args:
input_file (ZipFile): Zip file.
Raises:
InvalidArgumentTypeException: Thrown if the input file is not an
InvalidArgumentTypeError: Thrown if the input file is not an
instance of ZipFile.
"""
if not isinstance(input_file, ZipFile):
raise InvalidArgumentTypeException("input file must of type ZipFile.")
raise InvalidArgumentTypeError("input file must of type ZipFile.")

def extract_text(self) -> Dict[str, str]:
"""Extract text from the zip file using XML.
Returns:
Dict[str, str]: A dictionnary containing the document
Dict[str, str]: A dictionary containing the document
XML parts [head, body, footer] and their text.
"""
doc_text: Dict[str, str] = {}
Expand All @@ -73,7 +76,7 @@ def extract_text(self) -> Dict[str, str]:
return doc_text

def xml2text(self, xml_part: bytes) -> str:
"""Extract text from an xml component nodes.
"""Extract text from xml component nodes.
Args:
xml_part (bytes): XML component.
Expand All @@ -82,7 +85,7 @@ def xml2text(self, xml_part: bytes) -> str:
str: The extracted text.
"""
text = ""
root = ET.fromstring(xml_part)
root = ETree.fromstring(xml_part)
for child in root.iter():
if child.tag == TagEnum.SPACE:
text += child.text if child.text is not None else ""
Expand All @@ -97,17 +100,16 @@ def xml2text(self, xml_part: bytes) -> str:
text += LayoutEnum.MAJ_BREAK_LINE
return text

def to_xml(self) -> Dict[str, Union[bytes, List[bytes]]]:
def to_xml(self) -> XML_Type:
"""Convert a zip file to XML components header, body and footer.
Returns:
Dict[str, Union[bytes, List[bytes]]]: Dictionnary containing
XML_Type: Dictionary containing
the components content.
"""
xml_parts: Dict[str, Union[bytes, List[bytes]]] = {}
xml_parts["header"] = self.get_xml_part_by_pattern(CS.XML_HEADER)
xml_parts["body"] = self.__zip_file.read(CS.XML_BODY)
xml_parts["footer"] = self.get_xml_part_by_pattern(CS.XML_FOOTER)
xml_parts: XML_Type = {"header": self.get_xml_part_by_pattern(CS.XML_HEADER),
"body": self.__zip_file.read(CS.XML_BODY),
"footer": self.get_xml_part_by_pattern(CS.XML_FOOTER)}
return xml_parts

def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]:
Expand Down
12 changes: 6 additions & 6 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from unittest.mock import Mock

from docparser.document import Document
from docparser.exceptions import InvalidReturnValueException, MissingAttributeException
from docparser.exceptions import InvalidReturnValueError, MissingAttributeError
from docparser.parser import Parser


Expand All @@ -25,26 +25,26 @@ def setUpClass(cls) -> None:

def test_parser_with_invalid_file_parser(self):
test_file_parser = ""
with self.assertRaises(MissingAttributeException):
with self.assertRaises(MissingAttributeError):
Parser(file_parser=test_file_parser, file_ext="", file_name="")

def test_invalid_file_parser_extract_text_callable_return(self):
test_file_parser = Mock()
test_file_parser.extract_text = Mock(return_value=["list item"])
with self.assertRaises(InvalidReturnValueException):
with self.assertRaises(InvalidReturnValueError):
Parser(file_parser=test_file_parser, file_ext="", file_name="")

def test_get_document(self):
result_document = __class__.parser.document
self.assertTrue(isinstance(result_document, Document))
self.assertEqual(result_document.name, "file_name_example.docx")
self.assertEqual(result_document.ext, "docx")
self.assertTrue(isinstance(result_document.splitted_content, dict))
self.assertTrue(isinstance(result_document.divided_content, dict))
self.assertListEqual(
list(result_document.splitted_content.keys()), ["header", "body", "footer"]
list(result_document.divided_content.keys()), ["header", "body", "footer"]
)
self.assertListEqual(
list(result_document.splitted_content.values()),
list(result_document.divided_content.values()),
["xml header text", "xml body text", "xml footer text"],
)
self.assertEqual(
Expand Down
Loading

0 comments on commit 08fc7a5

Please sign in to comment.