Skip to content

Commit 96d4c52

Browse files
authored
feature: Introduce default_encoding parameter to set/autodetect the encoding if the charset is missing from the headers (lexiforest#284)
* Add a `default_encoding` parameter to [set|autodetect] the encoding if no charset is found in the headers * Update github workflow actions/*
1 parent 418e452 commit 96d4c52

File tree

7 files changed

+132
-42
lines changed

7 files changed

+132
-42
lines changed

.github/workflows/build-and-test.yaml

+13-14
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ jobs:
1919
name: Lint
2020
runs-on: ubuntu-latest
2121
steps:
22-
- uses: actions/checkout@v3
23-
- uses: actions/setup-python@v4
22+
- uses: actions/checkout@v4
23+
- uses: actions/setup-python@v5
2424
with:
2525
python-version: '3.10'
2626
- run: |
@@ -31,15 +31,15 @@ jobs:
3131
name: Build sdist wheel
3232
runs-on: ubuntu-latest
3333
steps:
34-
- uses: actions/checkout@v3
34+
- uses: actions/checkout@v4
3535
- run: |
3636
make preprocess
3737
pipx run build --sdist
38-
- uses: actions/upload-artifact@v3
38+
- uses: actions/upload-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
3939
with:
4040
path: ./dist/*.tar.gz
4141

42-
- uses: actions/setup-python@v4
42+
- uses: actions/setup-python@v5
4343
with:
4444
python-version: '3.10'
4545
- run: |
@@ -52,51 +52,50 @@ jobs:
5252
matrix:
5353
os: [ubuntu-22.04, macos-12, macos-14, windows-2019]
5454
steps:
55-
- uses: actions/checkout@v3
55+
- uses: actions/checkout@v4
5656

57-
- uses: actions/setup-python@v4
57+
- uses: actions/setup-python@v5
5858
with:
5959
python-version: '3.10'
6060

6161
- if: runner.os == 'Linux'
62-
uses: docker/setup-qemu-action@v2
62+
uses: docker/setup-qemu-action@v3
6363
with:
6464
platforms: all
6565

6666
# macOS make is too old
6767
- if: runner.os == 'macOS'
6868
run: |
6969
brew install make automake libtool
70-
which pipx || brew install pipx && pipx ensurepath
7170
7271
- name: Build and test wheels
73-
uses: pypa/cibuildwheel@v2.16.5
72+
uses: pypa/cibuildwheel@v2.17.0
7473

7574
# - name: Setup tmate session
7675
# uses: mxschmitt/action-tmate@v3
7776

78-
- uses: actions/upload-artifact@v3
77+
- uses: actions/upload-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
7978
with:
8079
path: ./wheelhouse/*.whl
8180

8281
upload_all:
8382
needs: [bdist, sdist]
8483
runs-on: ubuntu-latest
8584
steps:
86-
- uses: actions/download-artifact@v3
85+
- uses: actions/download-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
8786
if: startsWith(github.ref, 'refs/tags/')
8887
with:
8988
name: artifact
9089
path: dist
9190

92-
- uses: pypa/gh-action-pypi-publish@v1.5.0
91+
- uses: pypa/gh-action-pypi-publish@v1.8.14
9392
if: startsWith(github.ref, 'refs/tags/')
9493
with:
9594
password: ${{ secrets.PYPI_TOKEN }}
9695

9796
- name: Upload release files
9897
if: startsWith(github.ref, 'refs/tags/')
99-
uses: softprops/action-gh-release@v1
98+
uses: softprops/action-gh-release@v2
10099
with:
101100
files: |
102101
./dist/*.whl

curl_cffi/requests/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def request(
5858
impersonate: Optional[Union[str, BrowserType]] = None,
5959
thread: Optional[ThreadType] = None,
6060
default_headers: Optional[bool] = None,
61+
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
6162
curl_options: Optional[dict] = None,
6263
http_version: Optional[CurlHttpVersion] = None,
6364
debug: bool = False,
@@ -90,6 +91,8 @@ def request(
9091
impersonate: which browser version to impersonate.
9192
thread: work with other thread implementations. choices: eventlet, gevent.
9293
default_headers: whether to set default browser headers.
94+
default_encoding: encoding for decoding response content if charset is not found in headers.
95+
Defaults to "utf-8". Can be set to a callable for automatic detection.
9396
curl_options: extra curl options to use.
9497
http_version: limiting http version, http2 will be tried by default.
9598
debug: print extra curl debug info.
@@ -122,6 +125,7 @@ def request(
122125
content_callback=content_callback,
123126
impersonate=impersonate,
124127
default_headers=default_headers,
128+
default_encoding=default_encoding,
125129
http_version=http_version,
126130
interface=interface,
127131
multipart=multipart,

curl_cffi/requests/models.py

+59-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
import queue
2+
import re
23
import warnings
34
from concurrent.futures import Future
5+
from functools import cached_property
46
from json import loads
5-
from typing import Any, Awaitable, Dict, List, Optional
7+
from typing import Any, Awaitable, Callable, Dict, List, Optional, Union
68

79
from .. import Curl
810
from .cookies import Cookies
911
from .errors import RequestsError
1012
from .headers import Headers
1113

14+
CHARSET_RE = re.compile(r"charset=([\w-]+)")
15+
1216

1317
def clear_queue(q: queue.Queue):
1418
with q.mutex:
@@ -41,6 +45,8 @@ class Response:
4145
elapsed: how many seconds the request cost.
4246
encoding: http body encoding.
4347
charset: alias for encoding.
48+
charset_encoding: encoding specified by the Content-Type header.
49+
default_encoding: user-defined encoding used for decoding content if charset is not found in headers.
4450
redirect_count: how many redirects happened.
4551
redirect_url: the final redirected url.
4652
http_version: http version used.
@@ -58,8 +64,7 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
5864
self.headers = Headers()
5965
self.cookies = Cookies()
6066
self.elapsed = 0.0
61-
self.encoding = "utf-8"
62-
self.charset = self.encoding
67+
self.default_encoding: Union[str, Callable[[bytes], str]] = "utf-8"
6368
self.redirect_count = 0
6469
self.redirect_url = ""
6570
self.http_version = 0
@@ -70,16 +75,62 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
7075
self.astream_task: Optional[Awaitable] = None
7176
self.quit_now = None
7277

78+
@property
79+
def charset(self) -> str:
80+
"""Alias for encoding."""
81+
return self.encoding
82+
83+
@property
84+
def encoding(self) -> str:
85+
"""
86+
Determines the encoding to decode byte content into text.
87+
88+
The method follows a specific priority to decide the encoding:
89+
1. If `.encoding` has been explicitly set, it is used.
90+
2. The encoding specified by the `charset` parameter in the `Content-Type` header.
91+
3. The encoding specified by the `default_encoding` attribute. This can either be
92+
a string (e.g., "utf-8") or a callable for charset autodetection.
93+
"""
94+
if not hasattr(self, "_encoding"):
95+
encoding = self.charset_encoding
96+
if encoding is None:
97+
if isinstance(self.default_encoding, str):
98+
encoding = self.default_encoding
99+
elif callable(self.default_encoding):
100+
encoding = self.default_encoding(self.content)
101+
self._encoding = encoding or "utf-8"
102+
return self._encoding
103+
104+
@encoding.setter
105+
def encoding(self, value: str) -> None:
106+
if hasattr(self, "_text"):
107+
raise ValueError("Cannot set encoding after text has been accessed")
108+
self._encoding = value
109+
110+
@property
111+
def charset_encoding(self) -> Optional[str]:
112+
"""Return the encoding, as specified by the Content-Type header."""
113+
content_type = self.headers.get("Content-Type")
114+
if content_type:
115+
charset_match = CHARSET_RE.search(content_type)
116+
return charset_match.group(1) if charset_match else None
117+
return None
118+
119+
@property
120+
def text(self) -> str:
121+
if not hasattr(self, "_text"):
122+
if not self.content:
123+
self._text = ""
124+
else:
125+
self._text = self._decode(self.content)
126+
return self._text
127+
73128
def _decode(self, content: bytes) -> str:
74129
try:
75-
return content.decode(self.charset, errors="replace")
130+
return content.decode(self.encoding, errors="replace")
76131
except (UnicodeDecodeError, LookupError):
77132
return content.decode("utf-8-sig")
78133

79-
@property
80-
def text(self) -> str:
81-
return self._decode(self.content)
82-
83134
def raise_for_status(self):
84135
"""Raise an error if status code is not in [200, 400)"""
85136
if not self.ok:

curl_cffi/requests/session.py

+18-18
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import asyncio
22
import math
33
import queue
4-
import re
54
import threading
65
import warnings
76
from concurrent.futures import ThreadPoolExecutor
@@ -55,7 +54,6 @@ class ProxySpec(TypedDict, total=False):
5554
else:
5655
ProxySpec = Dict[str, str]
5756

58-
CHARSET_RE = re.compile(r"charset=([\w-]+)")
5957
ThreadType = Literal["eventlet", "gevent"]
6058

6159

@@ -205,6 +203,7 @@ def __init__(
205203
max_redirects: int = -1,
206204
impersonate: Optional[Union[str, BrowserType]] = None,
207205
default_headers: bool = True,
206+
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
208207
curl_options: Optional[dict] = None,
209208
curl_infos: Optional[list] = None,
210209
http_version: Optional[CurlHttpVersion] = None,
@@ -224,6 +223,7 @@ def __init__(
224223
self.max_redirects = max_redirects
225224
self.impersonate = impersonate
226225
self.default_headers = default_headers
226+
self.default_encoding = default_encoding
227227
self.curl_options = curl_options or {}
228228
self.curl_infos = curl_infos or []
229229
self.http_version = http_version
@@ -547,7 +547,7 @@ def qput(chunk):
547547

548548
return req, buffer, header_buffer, q, header_recved, quit_now
549549

550-
def _parse_response(self, curl, buffer, header_buffer):
550+
def _parse_response(self, curl, buffer, header_buffer, default_encoding):
551551
c = curl
552552
rsp = Response(c)
553553
rsp.url = cast(bytes, c.getinfo(CurlInfo.EFFECTIVE_URL)).decode()
@@ -583,13 +583,7 @@ def _parse_response(self, curl, buffer, header_buffer):
583583
rsp.cookies = self.cookies
584584
# print("Cookies after extraction", self.cookies)
585585

586-
content_type = rsp.headers.get("Content-Type", default="")
587-
charset_match = CHARSET_RE.search(content_type)
588-
charset = charset_match.group(1) if charset_match else "utf-8"
589-
590-
rsp.charset = charset
591-
rsp.encoding = charset # TODO use chardet
592-
586+
rsp.default_encoding = default_encoding
593587
rsp.elapsed = cast(float, c.getinfo(CurlInfo.TOTAL_TIME))
594588
rsp.redirect_count = cast(int, c.getinfo(CurlInfo.REDIRECT_COUNT))
595589
rsp.redirect_url = cast(bytes, c.getinfo(CurlInfo.REDIRECT_URL)).decode()
@@ -639,6 +633,8 @@ def __init__(
639633
max_redirects: max redirect counts, default unlimited(-1).
640634
impersonate: which browser version to impersonate in the session.
641635
interface: which network interface to use for requests to the server.
636+
default_encoding: encoding for decoding response content if charset is not found in headers.
637+
Defaults to "utf-8". Can be set to a callable for automatic detection.
642638
643639
Notes:
644640
This class can be used as a context manager.
@@ -767,6 +763,7 @@ def request(
767763
content_callback: Optional[Callable] = None,
768764
impersonate: Optional[Union[str, BrowserType]] = None,
769765
default_headers: Optional[bool] = None,
766+
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
770767
http_version: Optional[CurlHttpVersion] = None,
771768
interface: Optional[str] = None,
772769
cert: Optional[Union[str, Tuple[str, str]]] = None,
@@ -825,7 +822,7 @@ def perform():
825822
try:
826823
c.perform()
827824
except CurlError as e:
828-
rsp = self._parse_response(c, buffer, header_buffer)
825+
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
829826
rsp.request = req
830827
cast(queue.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
831828
finally:
@@ -843,7 +840,7 @@ def cleanup(fut):
843840

844841
# Wait for the first chunk
845842
cast(threading.Event, header_recved).wait()
846-
rsp = self._parse_response(c, buffer, header_buffer)
843+
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
847844
header_parsed.set()
848845

849846
# Raise the exception if something wrong happens when receiving the header.
@@ -868,11 +865,11 @@ def cleanup(fut):
868865
else:
869866
c.perform()
870867
except CurlError as e:
871-
rsp = self._parse_response(c, buffer, header_buffer)
868+
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
872869
rsp.request = req
873870
raise RequestsError(str(e), e.code, rsp) from e
874871
else:
875-
rsp = self._parse_response(c, buffer, header_buffer)
872+
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
876873
rsp.request = req
877874
return rsp
878875
finally:
@@ -919,6 +916,8 @@ def __init__(
919916
allow_redirects: whether to allow redirection.
920917
max_redirects: max redirect counts, default unlimited(-1).
921918
impersonate: which browser version to impersonate in the session.
919+
default_encoding: encoding for decoding response content if charset is not found in headers.
920+
Defaults to "utf-8". Can be set to a callable for automatic detection.
922921
923922
Notes:
924923
This class can be used as a context manager, and it's recommended to use via
@@ -1043,6 +1042,7 @@ async def request(
10431042
content_callback: Optional[Callable] = None,
10441043
impersonate: Optional[Union[str, BrowserType]] = None,
10451044
default_headers: Optional[bool] = None,
1045+
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
10461046
http_version: Optional[CurlHttpVersion] = None,
10471047
interface: Optional[str] = None,
10481048
cert: Optional[Union[str, Tuple[str, str]]] = None,
@@ -1093,7 +1093,7 @@ async def perform():
10931093
try:
10941094
await task
10951095
except CurlError as e:
1096-
rsp = self._parse_response(curl, buffer, header_buffer)
1096+
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
10971097
rsp.request = req
10981098
cast(asyncio.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
10991099
finally:
@@ -1113,7 +1113,7 @@ def cleanup(fut):
11131113
# Unlike threads, coroutines do not use preemptive scheduling.
11141114
# For asyncio, there is no need for a header_parsed event, the
11151115
# _parse_response will execute in the foreground, no background tasks running.
1116-
rsp = self._parse_response(curl, buffer, header_buffer)
1116+
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
11171117

11181118
first_element = _peek_aio_queue(cast(asyncio.Queue, q))
11191119
if isinstance(first_element, RequestsError):
@@ -1132,11 +1132,11 @@ def cleanup(fut):
11321132
await task
11331133
# print(curl.getinfo(CurlInfo.CAINFO))
11341134
except CurlError as e:
1135-
rsp = self._parse_response(curl, buffer, header_buffer)
1135+
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
11361136
rsp.request = req
11371137
raise RequestsError(str(e), e.code, rsp) from e
11381138
else:
1139-
rsp = self._parse_response(curl, buffer, header_buffer)
1139+
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
11401140
rsp.request = req
11411141
return rsp
11421142
finally:

0 commit comments

Comments
 (0)