1
1
import asyncio
2
2
import math
3
3
import queue
4
- import re
5
4
import threading
6
5
import warnings
7
6
from concurrent .futures import ThreadPoolExecutor
@@ -55,7 +54,6 @@ class ProxySpec(TypedDict, total=False):
55
54
else :
56
55
ProxySpec = Dict [str , str ]
57
56
58
- CHARSET_RE = re .compile (r"charset=([\w-]+)" )
59
57
ThreadType = Literal ["eventlet" , "gevent" ]
60
58
61
59
@@ -205,6 +203,7 @@ def __init__(
205
203
max_redirects : int = - 1 ,
206
204
impersonate : Optional [Union [str , BrowserType ]] = None ,
207
205
default_headers : bool = True ,
206
+ default_encoding : Union [str , Callable [[bytes ], str ]] = "utf-8" ,
208
207
curl_options : Optional [dict ] = None ,
209
208
curl_infos : Optional [list ] = None ,
210
209
http_version : Optional [CurlHttpVersion ] = None ,
@@ -224,6 +223,7 @@ def __init__(
224
223
self .max_redirects = max_redirects
225
224
self .impersonate = impersonate
226
225
self .default_headers = default_headers
226
+ self .default_encoding = default_encoding
227
227
self .curl_options = curl_options or {}
228
228
self .curl_infos = curl_infos or []
229
229
self .http_version = http_version
@@ -547,7 +547,7 @@ def qput(chunk):
547
547
548
548
return req , buffer , header_buffer , q , header_recved , quit_now
549
549
550
- def _parse_response (self , curl , buffer , header_buffer ):
550
+ def _parse_response (self , curl , buffer , header_buffer , default_encoding ):
551
551
c = curl
552
552
rsp = Response (c )
553
553
rsp .url = cast (bytes , c .getinfo (CurlInfo .EFFECTIVE_URL )).decode ()
@@ -583,13 +583,7 @@ def _parse_response(self, curl, buffer, header_buffer):
583
583
rsp .cookies = self .cookies
584
584
# print("Cookies after extraction", self.cookies)
585
585
586
- content_type = rsp .headers .get ("Content-Type" , default = "" )
587
- charset_match = CHARSET_RE .search (content_type )
588
- charset = charset_match .group (1 ) if charset_match else "utf-8"
589
-
590
- rsp .charset = charset
591
- rsp .encoding = charset # TODO use chardet
592
-
586
+ rsp .default_encoding = default_encoding
593
587
rsp .elapsed = cast (float , c .getinfo (CurlInfo .TOTAL_TIME ))
594
588
rsp .redirect_count = cast (int , c .getinfo (CurlInfo .REDIRECT_COUNT ))
595
589
rsp .redirect_url = cast (bytes , c .getinfo (CurlInfo .REDIRECT_URL )).decode ()
@@ -639,6 +633,8 @@ def __init__(
639
633
max_redirects: max redirect counts, default unlimited(-1).
640
634
impersonate: which browser version to impersonate in the session.
641
635
interface: which network interface to use for requests to the server.
636
+ default_encoding: encoding for decoding response content if charset is not found in headers.
637
+ Defaults to "utf-8". Can be set to a callable for automatic detection.
642
638
643
639
Notes:
644
640
This class can be used as a context manager.
@@ -767,6 +763,7 @@ def request(
767
763
content_callback : Optional [Callable ] = None ,
768
764
impersonate : Optional [Union [str , BrowserType ]] = None ,
769
765
default_headers : Optional [bool ] = None ,
766
+ default_encoding : Union [str , Callable [[bytes ], str ]] = "utf-8" ,
770
767
http_version : Optional [CurlHttpVersion ] = None ,
771
768
interface : Optional [str ] = None ,
772
769
cert : Optional [Union [str , Tuple [str , str ]]] = None ,
@@ -825,7 +822,7 @@ def perform():
825
822
try :
826
823
c .perform ()
827
824
except CurlError as e :
828
- rsp = self ._parse_response (c , buffer , header_buffer )
825
+ rsp = self ._parse_response (c , buffer , header_buffer , default_encoding )
829
826
rsp .request = req
830
827
cast (queue .Queue , q ).put_nowait (RequestsError (str (e ), e .code , rsp ))
831
828
finally :
@@ -843,7 +840,7 @@ def cleanup(fut):
843
840
844
841
# Wait for the first chunk
845
842
cast (threading .Event , header_recved ).wait ()
846
- rsp = self ._parse_response (c , buffer , header_buffer )
843
+ rsp = self ._parse_response (c , buffer , header_buffer , default_encoding )
847
844
header_parsed .set ()
848
845
849
846
# Raise the exception if something wrong happens when receiving the header.
@@ -868,11 +865,11 @@ def cleanup(fut):
868
865
else :
869
866
c .perform ()
870
867
except CurlError as e :
871
- rsp = self ._parse_response (c , buffer , header_buffer )
868
+ rsp = self ._parse_response (c , buffer , header_buffer , default_encoding )
872
869
rsp .request = req
873
870
raise RequestsError (str (e ), e .code , rsp ) from e
874
871
else :
875
- rsp = self ._parse_response (c , buffer , header_buffer )
872
+ rsp = self ._parse_response (c , buffer , header_buffer , default_encoding )
876
873
rsp .request = req
877
874
return rsp
878
875
finally :
@@ -919,6 +916,8 @@ def __init__(
919
916
allow_redirects: whether to allow redirection.
920
917
max_redirects: max redirect counts, default unlimited(-1).
921
918
impersonate: which browser version to impersonate in the session.
919
+ default_encoding: encoding for decoding response content if charset is not found in headers.
920
+ Defaults to "utf-8". Can be set to a callable for automatic detection.
922
921
923
922
Notes:
924
923
This class can be used as a context manager, and it's recommended to use via
@@ -1043,6 +1042,7 @@ async def request(
1043
1042
content_callback : Optional [Callable ] = None ,
1044
1043
impersonate : Optional [Union [str , BrowserType ]] = None ,
1045
1044
default_headers : Optional [bool ] = None ,
1045
+ default_encoding : Union [str , Callable [[bytes ], str ]] = "utf-8" ,
1046
1046
http_version : Optional [CurlHttpVersion ] = None ,
1047
1047
interface : Optional [str ] = None ,
1048
1048
cert : Optional [Union [str , Tuple [str , str ]]] = None ,
@@ -1093,7 +1093,7 @@ async def perform():
1093
1093
try :
1094
1094
await task
1095
1095
except CurlError as e :
1096
- rsp = self ._parse_response (curl , buffer , header_buffer )
1096
+ rsp = self ._parse_response (curl , buffer , header_buffer , default_encoding )
1097
1097
rsp .request = req
1098
1098
cast (asyncio .Queue , q ).put_nowait (RequestsError (str (e ), e .code , rsp ))
1099
1099
finally :
@@ -1113,7 +1113,7 @@ def cleanup(fut):
1113
1113
# Unlike threads, coroutines do not use preemptive scheduling.
1114
1114
# For asyncio, there is no need for a header_parsed event, the
1115
1115
# _parse_response will execute in the foreground, no background tasks running.
1116
- rsp = self ._parse_response (curl , buffer , header_buffer )
1116
+ rsp = self ._parse_response (curl , buffer , header_buffer , default_encoding )
1117
1117
1118
1118
first_element = _peek_aio_queue (cast (asyncio .Queue , q ))
1119
1119
if isinstance (first_element , RequestsError ):
@@ -1132,11 +1132,11 @@ def cleanup(fut):
1132
1132
await task
1133
1133
# print(curl.getinfo(CurlInfo.CAINFO))
1134
1134
except CurlError as e :
1135
- rsp = self ._parse_response (curl , buffer , header_buffer )
1135
+ rsp = self ._parse_response (curl , buffer , header_buffer , default_encoding )
1136
1136
rsp .request = req
1137
1137
raise RequestsError (str (e ), e .code , rsp ) from e
1138
1138
else :
1139
- rsp = self ._parse_response (curl , buffer , header_buffer )
1139
+ rsp = self ._parse_response (curl , buffer , header_buffer , default_encoding )
1140
1140
rsp .request = req
1141
1141
return rsp
1142
1142
finally :
0 commit comments