Automatically split ICDAR proceedings into separate papers. The script reads each volume's PDF table of contents with PyMuPDF (fitz), uses the level-1 outline entries to tell chapter headings from paper titles, saves every paper as its own PDF under output_dir/<book>/<chapter>/, and writes a tab-separated report.csv listing titles, extracted keywords, and save paths.
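The chapter/paper heuristic relies on the shape of the PDF outline: a level-1 entry immediately followed by another level-1 entry is treated as a chapter heading, while a level-1 entry followed by nested level-2 sections is treated as a paper. For illustration, the doc.get_toc() entries it expects look roughly like this (titles and page numbers are invented):

    [1, 'Document Analysis and Recognition', 11]   # chapter heading: next entry is also level 1
    [1, 'Some Paper on Text Detection', 13]        # paper title
    [2, 'Introduction', 13]                        # nested sections belong to the paper above
    [2, 'References', 25]
    [1, 'Another Paper on Layout Analysis', 27]    # next paper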
import argparse
import re
from collections import defaultdict
from pathlib import Path

import fitz
import pandas as pd
from fitz import Page

# Front/back-matter TOC entries that are not papers and should be skipped.
EXCLUDE_KEYWORD = [
    'Contents',
    'Organization',
    'Preface',
    'Foreword',
    'Author Index',
]

# Markers that introduce the keyword list on a paper's first page.
PAPER_KEYWORD_PTTN = [
    'Keywords:',
]

# Matches every character that should not appear in a file name.
CLEAN_PTTN = re.compile(r'[^0-9a-zA-Z]')


def extract_keyword(first_page: Page) -> str:
    """Return the comma-joined keyword list found on a paper's first page."""
    keywords = []
    for blk in first_page.get_text("blocks"):
        text = blk[4]  # blk[4] holds the text content of the block
        for marker in PAPER_KEYWORD_PTTN:
            if text.startswith(marker):
                # Springer-style keyword lists are separated by middle dots.
                for kw in text.replace(marker, '').split('·'):
                    # A single keyword may wrap across lines within the block.
                    keywords += [part.strip() for part in kw.strip().split('\n')]
                return ','.join(keywords) if keywords else 'Not found'
    return 'Not found'


def split_pdf_book(book_dirs, output_dir=None, pattern='*.pdf'):
    book_dirs = Path(book_dirs)
    if book_dirs.is_file():
        book_paths = [book_dirs]
    else:
        book_paths = list(book_dirs.rglob(pattern))

    # Default to the input location when no output directory is given.
    if output_dir is None:
        output_dir = book_dirs if book_dirs.is_dir() else book_dirs.parent
    output_dir = Path(output_dir)

    statistic_info = {
        'book_name': [],
        'chapter_title': [],
        'paper_title': [],
        'original_title': [],
        'keyword': [],
        'save_path': [],
    }

    for book_path in book_paths:
        paper_dir = output_dir / book_path.stem
        paper_dir.mkdir(parents=True, exist_ok=True)

        doc = fitz.Document(book_path.as_posix())
        toc = doc.get_toc(simple=False)
        contents = [[int(t[0]), t[1].strip(), int(t[2])] for t in toc]

        # Drop front/back matter, then sort the entries by page number.
        remain_content = [
            content for content in contents
            if not any(key in content[1] for key in EXCLUDE_KEYWORD)
        ]
        remain_content = sorted(remain_content, key=lambda content: content[2])

        # A level-1 entry immediately followed by another level-1 entry is a
        # chapter heading; one followed by nested sub-entries is a paper.
        first_level_idx = [idx for idx, item in enumerate(remain_content) if item[0] == 1]
        first_level_idx.append(len(remain_content))
        rel_first_level_idx = {idx: i for i, idx in enumerate(first_level_idx)}

        chapter_idxs, paper_idxs = [], []
        for idx, next_idx in zip(first_level_idx[:-1], first_level_idx[1:]):
            if next_idx - idx == 1:
                chapter_idxs.append(idx)
            else:
                paper_idxs.append(idx)

        # Map each chapter entry to the (start, end) index ranges of its papers.
        chapter_info = defaultdict(list)
        for idx in chapter_idxs:
            chapter_info[idx] = []
        chapter_idxs.append(len(remain_content))
        paper_idxs.append(len(remain_content))
        chapter_ranges = list(zip(chapter_idxs[:-1], chapter_idxs[1:]))
        paper_ranges = list(zip(paper_idxs[:-1], paper_idxs[1:]))

        for idx, next_idx in paper_ranges:
            for chapter_idx, next_chapter_idx in chapter_ranges:
                if chapter_idx < idx < next_chapter_idx:
                    start_paper_idx = idx
                    # The last paper in a chapter ends just before the next
                    # chapter heading; every other paper ends just before the
                    # next paper's entry.
                    if rel_first_level_idx[idx] == rel_first_level_idx[next_chapter_idx] - 1:
                        end_paper_idx = next_chapter_idx - 1
                    else:
                        end_paper_idx = next_idx - 1
                    chapter_info[chapter_idx].append((start_paper_idx, end_paper_idx))
                    break

        for chapter, ranges in chapter_info.items():
            chapter_title = remain_content[chapter][1]
            chapter_dir = paper_dir / chapter_title
            chapter_dir.mkdir(parents=True, exist_ok=True)
            for start_idx, end_idx in ranges:
                # TOC page numbers are 1-based; fitz page indices are 0-based.
                start_page = remain_content[start_idx][2] - 1
                end_page = remain_content[end_idx][2] - 1
                extracted_keyword = extract_keyword(doc[start_page])

                paper_inst = fitz.open()
                paper_inst.insert_pdf(doc, from_page=start_page, to_page=end_page)

                paper_name = remain_content[start_idx][1]
                # Replace non-alphanumeric characters with spaces, then join
                # the remaining words with underscores for a safe file name.
                clean_paper_name = re.sub(CLEAN_PTTN, ' ', paper_name)
                clean_paper_name = '_'.join(clean_paper_name.split())
                save_path = chapter_dir.joinpath(clean_paper_name).with_suffix('.pdf')
                paper_inst.save(save_path)
                paper_inst.close()

                statistic_info['original_title'].append(paper_name)
                statistic_info['paper_title'].append(clean_paper_name)
                statistic_info['chapter_title'].append(chapter_title)
                statistic_info['book_name'].append(book_path.stem)
                statistic_info['keyword'].append(extracted_keyword)
                statistic_info['save_path'].append(save_path.as_posix())

        doc.close()

    # One tab-separated report covering every extracted paper.
    statistic_info_df = pd.DataFrame.from_dict(statistic_info)
    statistic_info_df.to_csv(output_dir / 'report.csv', index=False, sep='\t')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', required=True,
                        help='A proceedings PDF or a directory of PDFs')
    parser.add_argument('--output_dir', default=None,
                        help='Where to write per-paper PDFs and report.csv')
    args = parser.parse_args()
    split_pdf_book(book_dirs=args.input_dir, output_dir=args.output_dir)
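
A minimal usage sketch; the script file name and the paths below are placeholders, not part of the gist:

    python split_icdar_papers.py --input_dir ./proceedings --output_dir ./split_papers

The per-paper PDFs land in ./split_papers/<volume>/<chapter>/, and ./split_papers/report.csv (tab-separated despite the .csv extension) lists each paper's book, chapter, cleaned title, original title, extracted keywords, and save path.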