Skip to content

Commit

Permalink
Merge pull request #1 from has-abi/codebase_refactoring
Browse files Browse the repository at this point in the history
Refactor the codebase
  • Loading branch information
has-abi authored Feb 10, 2024
2 parents 8b0819f + 540a2d4 commit 7d45535
Show file tree
Hide file tree
Showing 12 changed files with 78 additions and 82 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: 3.9
python-version: "3.10"

- name: Install Python dependencies
run: |
Expand All @@ -31,7 +31,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,7 @@ ENV/
.pytest_cache

# html coverage
htmlcov/
htmlcov/

# Pycharm
.idea
3 changes: 1 addition & 2 deletions docparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@


from io import BufferedReader
from typing import Union

from docparser.document import Document
from docparser.parser import Parser
Expand All @@ -14,7 +13,7 @@
from docparser.xml_parser import XMLParser


def parse(input_file: Union[str, BufferedReader]) -> Document:
def parse(input_file: str | BufferedReader) -> Document:
file_name, file_ext = get_file_name_and_ext(input_file)
reader = Reader(input_file, file_ext)
file_parser = XMLParser(reader.zip_file)
Expand Down
2 changes: 1 addition & 1 deletion docparser/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ class Document:
name: str
ext: str
content: str
splitted_content: Dict[str, str]
divided_content: Dict[str, str]
13 changes: 4 additions & 9 deletions docparser/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,23 @@
"""


class InvalidArgumentTypeException(Exception):
class InvalidArgumentTypeError(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)


class FileNotFoundException(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)


class UnsupportedFileFormatException(Exception):
class UnsupportedFileFormatError(Exception):
def __init__(self, file_format: str) -> None:
super().__init__(
f"{file_format} if not supported. supported formats are docx and doc."
)


class MissingAttributeException(Exception):
class MissingAttributeError(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)


class InvalidReturnValueException(Exception):
class InvalidReturnValueError(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)
18 changes: 9 additions & 9 deletions docparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from typing import Any

from docparser.document import Document
from docparser.exceptions import InvalidReturnValueException, MissingAttributeException
from docparser.exceptions import InvalidReturnValueError, MissingAttributeError


class Parser:
Expand Down Expand Up @@ -48,13 +48,13 @@ def __check(self, file_parser: Any) -> None:
file_parser (Any): A file parser.
Raises:
MissingAttributeException: Thrown if the file parser doesn't have
MissingAttributeError: Thrown if the file parser doesn't have
a callable `extract_text`.
"""
if not (
hasattr(file_parser, "extract_text") and callable(file_parser.extract_text)
):
raise MissingAttributeException(
raise MissingAttributeError(
"Missing callable extract_text() from file_parser instance."
)

Expand All @@ -66,21 +66,21 @@ def get_document(self, file_ext: str, file_name: str) -> Document:
file_name (str): The original file name.
Raises:
InvalidValueException: Thrown if the file parser callable
InvalidReturnValueError: Thrown if the file parser callable
`extract_text` returns a value that is not a dict.
Returns:
Document: A document object that represents the parsed results.
"""
splitted_content = self.file_parser.extract_text()
if not isinstance(splitted_content, dict):
raise InvalidReturnValueException(
divided_content = self.file_parser.extract_text()
if not isinstance(divided_content, dict):
raise InvalidReturnValueError(
"The file parser extract_text callable return value must be a dict"
)
content = " ".join(list(splitted_content.values()))
content = " ".join(list(divided_content.values()))
return Document(
name=file_name,
ext=file_ext,
content=content,
splitted_content=splitted_content,
divided_content=divided_content,
)
30 changes: 14 additions & 16 deletions docparser/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,65 +12,63 @@

import os
from io import BufferedReader
from typing import Union
from zipfile import ZipFile

import docparser.constants as CS
from docparser.exceptions import (
FileNotFoundException,
InvalidArgumentTypeException,
UnsupportedFileFormatException,
InvalidArgumentTypeError,
UnsupportedFileFormatError,
)


class Reader:
"""Docparser `Reader` class that reads a docx file as a zip file.
Args:
input_file (Union[str, BufferedReader]): Input file that could be a file
input_file (str | BufferedReader): Input file that could be a file
or a file path.
file_ext (str): The input file extension.
"""

def __init__(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None:
"""Docparser `Reader` class that reads a docx file as a zip file.
Args:
input_file (Union[str, BufferedReader]): Input file that could be a file
input_file (str | BufferedReader): Input file that could be a file
or a file path.
file_ext (str): The input file extension.
"""
self.__check(input_file, file_ext)
self.input_file = input_file
self.zip_file = self.to_zip()

def __check(self, input_file: Union[str, BufferedReader], file_ext: str) -> None:
"""Check the input arguments of the class constuctor for invalid
def __check(self, input_file: str | BufferedReader, file_ext: str) -> None:
"""Check the input arguments of the class constructor for invalid
types or values.
Args:
input_file (Union[str, BufferedReader]): Input file that could be a file
input_file (str | BufferedReader): Input file that could be a file
or a file path.
file_ext (str): The input file extension.
Raises:
InvalidArgumentTypeException: Thrown if any argument has an invalid
InvalidArgumentTypeError: Thrown if any argument has an invalid
type.
UnsupportedFileFormatException: Thrown if the input file has unsupported
UnsupportedFileFormatError: Thrown if the input file has unsupported
format.
FileNotFoundException: Thrown if the input file doesn't exist on disk or
FileNotFoundError: Thrown if the input file doesn't exist on disk or
is not found.
"""
if not isinstance(input_file, (str, BufferedReader)):
raise InvalidArgumentTypeException(
raise InvalidArgumentTypeError(
"input_file must be a file path or a binary file."
)

if file_ext not in CS.ALLOWED_EXTS:
raise UnsupportedFileFormatException(file_ext)
raise UnsupportedFileFormatError(file_ext)

if isinstance(input_file, str) and not os.path.isfile(input_file):
raise FileNotFoundException(f"File not found: {input_file}")
raise FileNotFoundError(f"File not found: {input_file}")

def to_zip(self) -> ZipFile:
"""Convert the input file to a zip file.
Expand Down
14 changes: 6 additions & 8 deletions docparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,29 @@

import os
from io import BufferedReader
from typing import Tuple, Union
from typing import Tuple


def get_file_name_and_ext(
file_or_filepath: Union[str, BufferedReader]
) -> Tuple[str, str]:
def get_file_name_and_ext(file_or_filepath: str | BufferedReader) -> Tuple[str, str]:
"""Extract the file extension and the file name
from a file or a file name.
Args:
file_or_filepath (Union[str, BufferedReader]): File or file path.
file_or_filepath (str | BufferedReader): File or file path.
Returns:
Tuple[str, str]: Tuple of file name and file extension
"""
filename = get_file_name(file_or_filepath)
ext = filename.rsplit(".", 1)[1]
return (filename, ext.lower())
return filename, ext.lower()


def get_file_name(file_or_filepath: Union[str, BufferedReader]) -> str:
def get_file_name(file_or_filepath: str | BufferedReader) -> str:
"""Extract the file name from a file or a file path.
Args:
file_or_filepath (Union[str, BufferedReader]): File or a file path.
file_or_filepath (str | BufferedReader): File or a file path.
Returns:
str: The extracted file name.
Expand Down
38 changes: 21 additions & 17 deletions docparser/xml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,28 @@


import re
import xml.etree.ElementTree as ET
from typing import Dict, List, Union
import xml.etree.ElementTree as ETree
from typing import Dict, List
from zipfile import ZipFile

import docparser.constants as CS
from docparser.enums import LayoutEnum, TagEnum
from docparser.exceptions import InvalidArgumentTypeException
from docparser.exceptions import InvalidArgumentTypeError


XML_Type = Dict[str, bytes | List[bytes]]


class XMLParser:
"""Docpatser `XMLParser` class that parses the input zip file
"""Docparser `XMLParser` class that parses the input zip file
using the python package `xml`.
Args:
input_file (ZipFile): Zip file.
"""

def __init__(self, input_file: ZipFile) -> None:
"""Docpatser `XMLParser` class that parses the input zip file
"""Docparser `XMLParser` class that parses the input zip file
using the python package `xml`.
Args:
Expand All @@ -41,24 +44,24 @@ def __init__(self, input_file: ZipFile) -> None:
self.__name_list = self.__zip_file.namelist()

def __check(self, input_file: ZipFile) -> None:
"""Check the input arguments of the class constuctor for invalid
"""Check the input arguments of the class constructor for invalid
types or values.
Args:
input_file (ZipFile): Zip file.
Raises:
InvalidArgumentTypeException: Thrown if the input file is not an
InvalidArgumentTypeError: Thrown if the input file is not an
instance of ZipFile.
"""
if not isinstance(input_file, ZipFile):
raise InvalidArgumentTypeException("input file must of type ZipFile.")
raise InvalidArgumentTypeError("input file must of type ZipFile.")

def extract_text(self) -> Dict[str, str]:
"""Extract text from the zip file using XML.
Returns:
Dict[str, str]: A dictionnary containing the document
Dict[str, str]: A dictionary containing the document
XML parts [header, body, footer] and their text.
"""
doc_text: Dict[str, str] = {}
Expand All @@ -73,7 +76,7 @@ def extract_text(self) -> Dict[str, str]:
return doc_text

def xml2text(self, xml_part: bytes) -> str:
"""Extract text from an xml component nodes.
"""Extract text from xml component nodes.
Args:
xml_part (bytes): XML component.
Expand All @@ -82,7 +85,7 @@ def xml2text(self, xml_part: bytes) -> str:
str: The extracted text.
"""
text = ""
root = ET.fromstring(xml_part)
root = ETree.fromstring(xml_part)
for child in root.iter():
if child.tag == TagEnum.SPACE:
text += child.text if child.text is not None else ""
Expand All @@ -97,17 +100,18 @@ def xml2text(self, xml_part: bytes) -> str:
text += LayoutEnum.MAJ_BREAK_LINE
return text

def to_xml(self) -> Dict[str, Union[bytes, List[bytes]]]:
def to_xml(self) -> XML_Type:
"""Convert a zip file to XML components header, body and footer.
Returns:
Dict[str, Union[bytes, List[bytes]]]: Dictionnary containing
XML_Type: Dictionary containing
the components content.
"""
xml_parts: Dict[str, Union[bytes, List[bytes]]] = {}
xml_parts["header"] = self.get_xml_part_by_pattern(CS.XML_HEADER)
xml_parts["body"] = self.__zip_file.read(CS.XML_BODY)
xml_parts["footer"] = self.get_xml_part_by_pattern(CS.XML_FOOTER)
xml_parts: XML_Type = {
"header": self.get_xml_part_by_pattern(CS.XML_HEADER),
"body": self.__zip_file.read(CS.XML_BODY),
"footer": self.get_xml_part_by_pattern(CS.XML_FOOTER),
}
return xml_parts

def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]:
Expand Down
Loading

0 comments on commit 7d45535

Please sign in to comment.