Skip to content

Commit 3c72462

Browse files
BUG: Get font information more reliably when removing text (#3252)
1 parent 9c176ec commit 3c72462

File tree

2 files changed

+69
-17
lines changed

2 files changed

+69
-17
lines changed

pypdf/_writer.py

Lines changed: 47 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -2054,7 +2054,7 @@ def remove_objects_from_page(
20542054
text_filters: Properties of text to be deleted, if applicable. Optional.
20552055
This is a Python dictionary with the following properties:
20562056
2057-
* font_ids: List of font IDs (such as /F1 or /T1_0) to be deleted.
2057+
* font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.
20582058
20592059
"""
20602060
if isinstance(to_delete, (list, tuple)):
@@ -2119,8 +2119,9 @@ def clean(
21192119
)
21202120
):
21212121
if (
2122-
not to_delete & ObjectDeletionFlag.TEXT
2123-
or (not font_ids_to_delete or font_id in font_ids_to_delete)
2122+
not to_delete & ObjectDeletionFlag.TEXT
2123+
or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
2124+
or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
21242125
):
21252126
del content.operations[i]
21262127
else:
@@ -2246,16 +2247,49 @@ def remove_text(self, font_names: Optional[List[str]] = None) -> None:
22462247
font_names = []
22472248

22482249
for page in self.pages:
2249-
font_ids = []
2250-
fonts = page.get("/Resources", {}).get("/Font", {})
2251-
for font_id, font_info in fonts.items():
2252-
font_name = font_info.get("/BaseFont", "").split("+")[-1]
2253-
if font_name in font_names:
2254-
font_ids.append(font_id)
2255-
2256-
text_filters = {
2257-
"font_ids": font_ids,
2258-
}
2250+
resource_ids_to_remove = []
2251+
2252+
# Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
2253+
# Font names need to be converted to resource names/IDs for easier removal
2254+
if font_names:
2255+
# Recursively loop through page objects to gather font info
2256+
def get_font_info(
2257+
obj: Any,
2258+
font_info: Optional[Dict[str, Any]] = None,
2259+
key: Optional[str] = None
2260+
) -> Dict[str, Any]:
2261+
if font_info is None:
2262+
font_info = {}
2263+
if isinstance(obj, IndirectObject):
2264+
obj = obj.get_object()
2265+
if isinstance(obj, dict):
2266+
if obj.get("/Type") == "/Font":
2267+
font_name = obj.get("/BaseFont", "")
2268+
# Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
2269+
normalized_font_name = font_name.lstrip("/").split("+")[-1]
2270+
if normalized_font_name not in font_info:
2271+
font_info[normalized_font_name] = {
2272+
"normalized_font_name": normalized_font_name,
2273+
"resource_ids": [],
2274+
}
2275+
if key not in font_info[normalized_font_name]["resource_ids"]:
2276+
font_info[normalized_font_name]["resource_ids"].append(key)
2277+
for k in obj:
2278+
font_info = get_font_info(obj[k], font_info, k)
2279+
elif isinstance(obj, (list, ArrayObject)):
2280+
for child_obj in obj:
2281+
font_info = get_font_info(child_obj, font_info)
2282+
return font_info
2283+
2284+
# Add relevant resource names for removal
2285+
font_info = get_font_info(page.get("/Resources"))
2286+
for font_name in font_names:
2287+
if font_name in font_info:
2288+
resource_ids_to_remove.extend(font_info[font_name]["resource_ids"])
2289+
2290+
text_filters = {}
2291+
if font_names:
2292+
text_filters["font_ids"] = resource_ids_to_remove
22592293
self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)
22602294

22612295
def add_uri(

tests/test_writer.py

Lines changed: 22 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1386,22 +1386,40 @@ def test_new_removes():
13861386
assert b"Chap" not in bb
13871387
assert b" TJ" not in bb
13881388

1389+
# Test removing text in a specified font
13891390
writer = PdfWriter()
13901391
writer.clone_document_from_reader(reader)
13911392
b = BytesIO()
13921393
writer.write(b)
1393-
reader = PdfReader(b)
1394-
text = reader.pages[0].extract_text()
1394+
temp_reader = PdfReader(b)
1395+
text = temp_reader.pages[0].extract_text()
13951396
assert "Arbeitsschritt" in text
13961397
assert "Modelltechnik" in text
13971398
writer.remove_text(font_names=["LiberationSans-Bold"])
13981399
b = BytesIO()
13991400
writer.write(b)
1400-
reader = PdfReader(b)
1401-
text = reader.pages[0].extract_text()
1401+
temp_reader = PdfReader(b)
1402+
text = temp_reader.pages[0].extract_text()
14021403
assert "Arbeitsschritt" not in text
14031404
assert "Modelltechnik" in text
14041405

1406+
# Test removing text in a specified font that doesn't exist (nothing should happen)
1407+
writer = PdfWriter()
1408+
writer.clone_document_from_reader(reader)
1409+
b = BytesIO()
1410+
writer.write(b)
1411+
temp_reader = PdfReader(b)
1412+
text = temp_reader.pages[0].extract_text()
1413+
assert "Arbeitsschritt" in text
1414+
assert "Modelltechnik" in text
1415+
writer.remove_text(font_names=["ComicSans-Oblique"])
1416+
b = BytesIO()
1417+
writer.write(b)
1418+
temp_reader = PdfReader(b)
1419+
text = temp_reader.pages[0].extract_text()
1420+
assert "Arbeitsschritt" in text
1421+
assert "Modelltechnik" in text
1422+
14051423
url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf"
14061424
name = "GeoBaseWithComments.pdf"
14071425
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

0 commit comments

Comments (0)