forked from kovidgoyal/calibre
-
Notifications
You must be signed in to change notification settings - Fork 0
/
atlantic_com.recipe
224 lines (197 loc) · 9.07 KB
/
atlantic_com.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
import json
from xml.sax.saxutils import escape, quoteattr
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes as prefix_classes, classes
# Selects which flavour of the recipe the class below builds: when True (and
# no test_article is set) the class defines an RSS ``feeds`` list; otherwise
# it defines parse_index(), which reads the magazine index page.
web_version = True
# Set to an article URL to make parse_index() return just that one article
# (debugging aid); None disables it.
test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
# {{{ parse article JSON
def process_image_block(lines, block):
    """Append the HTML for one image block to ``lines``.

    :param lines: list of HTML fragments, extended in place.
    :param block: dict describing the image. ``block['url']`` is required;
        ``captionText`` and ``attributionText`` are optional and may be
        missing or ``None``.
    :raises KeyError: if ``block`` has no ``'url'`` key.
    """
    caption = block.get('captionText')
    caption_lines = []
    if caption:
        # The key can be present with a null value in the JSON, so fall back
        # to '' before calling .strip() instead of relying on the .get()
        # default (which only covers a missing key).
        attribution = block.get('attributionText') or ''
        if attribution.strip():
            caption += ' (' + attribution + ')'
        # The caption is inserted as-is (not escaped).
        caption_lines.append('<p style="font-style: italic">' + caption + '</p>')
    # quoteattr() both escapes the URL and supplies the surrounding quotes.
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(caption_lines)
    lines.append('</div>')
def json_to_html(raw):
    """Build a simple HTML document from the JSON embedded in an article page.

    :param raw: JSON text (the contents of the ``__NEXT_DATA__`` script tag).
    :return: an HTML string whose content is wrapped in
        ``<div id="from-json-by-calibre">``.
    :raises KeyError, IndexError, ValueError: if the JSON does not have the
        layout read below (``props.pageProps.urqlState[*].data`` containing an
        ``article`` with ``title``, ``authors`` and ``content``).
    """
    data = json.loads(raw)
    # Debugging aid: dump the decoded page data for inspection.
    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
    # Every urqlState entry carries a JSON string under 'data'; the longest
    # one is taken as the article payload.
    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
    article = json.loads(data)['article']
    lines = []
    lines.append('<h1 style="align: center">' + escape(article['title']) + '</h1>')
    # The dek may be null or absent; escape() would fail on None.
    lines.append('<h2 style="align: center">' + escape(article.get('dek') or '') + '</h2>')
    auts = ', '.join(x['displayName'] for x in article['authors'])
    if auts:
        lines.append('<p style="align: center">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])
    for item in article['content']:
        tn = item.get('__typename', '')
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        # Skip items without markup (missing or null innerHtml) and embedded
        # iframes.
        if html is None or '</iframe>' in html:
            continue
        # tagName may be null as well as missing; default to a paragraph.
        tagname = (item.get('tagName') or 'P').lower()
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
class NoJSON(ValueError):
    """Raised when a page has no ``__NEXT_DATA__`` script tag to read JSON from."""
def extract_html(soup):
    """Convert the JSON embedded in a parsed page into HTML.

    :param soup: parsed page; searched for ``<script id="__NEXT_DATA__">``.
    :return: the HTML produced by json_to_html() from the first such tag.
    :raises NoJSON: if the page contains no such script tag.
    """
    candidates = soup.findAll('script', id='__NEXT_DATA__')
    if candidates:
        # The tag's first child is the JSON text.
        return json_to_html(candidates[0].contents[0])
    raise NoJSON('No script tag with JSON data found')
# }}}
class TheAtlantic(BasicNewsRecipe):
    """Calibre news recipe for The Atlantic.

    The module-level ``web_version`` and ``test_article`` flags decide, at
    class-creation time, whether the class exposes an RSS ``feeds`` list or a
    ``parse_index()`` method that reads the magazine index page.
    """

    if web_version:
        title = 'TheAtlantic.com'
        description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine'
    else:
        title = 'The Atlantic'
        description = 'Current affairs and politics focussed on the US'
    # Magazine index page; only parse_index() reads it.
    INDEX = 'https://www.theatlantic.com/magazine/'

    __author__ = 'Kovid Goyal'
    language = 'en'
    encoding = 'utf-8'

    # Selectors for the parts of an article page to keep. The
    # 'from-json-by-calibre' id matches the wrapper div that json_to_html()
    # generates.
    keep_only_tags = [
        dict(itemprop=['headline']),
        classes(
            'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
            ' lead-img article-cover-extra article-body article-magazine article-cover-content'
        ),
        prefix_classes(
            'ArticleHeader_root__ ArticleLayoutSection_main__ ArticleBody_root__'
        ),
        dict(itemprop='articleBody'),
        # these are for photos articles
        dict(id=['article-header', 'from-json-by-calibre']),
        classes('photos'),
    ]
    # Selectors for page furniture (ads, sharing widgets, recirculation
    # links, ...) to drop.
    remove_tags = [
        classes(
            'c-ad c-share-social c-recirculation-link social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
            ' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
        ),
        prefix_classes('ArticleRecirc_inline__'),
        {
            'name': ['meta', 'link', 'noscript', 'aside', 'h3']
        },
        {
            'attrs': {
                'class': ['offset-wrapper', 'boxtop-most-popular']
            }
        },
        {
            'attrs': {
                'class': lambda x: x and 'article-tools' in x
            }
        },
        {
            'src': lambda x: x and 'spotxchange.com' in x
        },
    ]
    remove_tags_after = classes('article-body')
    no_stylesheets = True
    remove_attributes = ['style']
    extra_css = '''
        .credit { text-align: right; font-size: 75%; display: block }
        .figcaption { font-size: 75% }
        .caption { font-size: 75% }
        .lead-img { display: block }
        p.dropcap:first-letter {
        float: left; text-transform: uppercase; font-weight: bold; font-size: 5.55em; line-height: 0.83;
        margin: 0; padding-right: 7px; margin-bottom: -2px; text-align: center;
        }
    '''

    def get_browser(self):
        """Return the base class's browser with an extra cookie set.

        The ``inEuropeanUnion`` cookie is set to ``'0'`` for
        ``.theatlantic.com`` (presumably to avoid EU-specific consent pages;
        not verifiable from this file).
        """
        br = BasicNewsRecipe.get_browser(self)
        br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
        return br

    def preprocess_raw_html(self, raw_html, url):
        """Rebuild the article from its embedded JSON when possible.

        :param raw_html: downloaded page markup.
        :param url: page URL, used only in log messages.
        :return: HTML generated by extract_html(), or ``raw_html`` unchanged
            if the JSON is absent or cannot be processed.
        """
        try:
            return extract_html(self.index_to_soup(raw_html))
        except NoJSON:
            self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
        except Exception:
            # Deliberately broad: any failure in the JSON path is logged and
            # the plain HTML is used instead.
            self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
        return raw_html

    def preprocess_html(self, soup):
        """Copy image URLs from ``data-srcset`` / ``data-src`` into ``src``.

        :param soup: parsed article; modified in place.
        :return: the same ``soup``.
        """
        for img in soup.findAll('img', attrs={'data-srcset': True}):
            # Use the first comma-separated candidate and keep only its URL
            # token, so "a.jpg 400w, b.jpg 800w" gives "a.jpg" rather than
            # "a.jpg 400w". An empty attribute leaves src untouched.
            first_candidate = img['data-srcset'].split(',')[0].split()
            if first_candidate:
                img['src'] = first_candidate[0]
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def print_version(self, url):
        """Return the URL to fetch for an article.

        Any query string is replaced with ``?single_page=true``. URLs
        containing ``/video/`` yield ``None``.
        """
        ans = url.partition('?')[0] + '?single_page=true'
        if '/video/' in ans:
            ans = None
        return ans

    if web_version and not test_article:
        # Website edition: articles come from the RSS feeds below.
        use_embedded_content = False
        feeds = [
            ('The Atlantic', 'https://www.theatlantic.com/feed/all/'),
            ('Best of The Atlantic', 'https://www.theatlantic.com/feed/best-of/'),
            ('Politics | The Atlantic', 'https://www.theatlantic.com/feed/channel/politics/'),
            ('Business | The Atlantic', 'https://www.theatlantic.com/feed/channel/business/'),
            ('Culture | The Atlantic', 'https://www.theatlantic.com/feed/channel/entertainment/'),
            ('Global | The Atlantic', 'https://www.theatlantic.com/feed/channel/international/'),
            ('Technology | The Atlantic', 'https://www.theatlantic.com/feed/channel/technology/'),
            ('U.S. | The Atlantic', 'https://www.theatlantic.com/feed/channel/national/'),
            ('Health | The Atlantic', 'https://www.theatlantic.com/feed/channel/health/'),
            ('Video | The Atlantic', 'https://www.theatlantic.com/feed/channel/video/'),
            ('Sexes | The Atlantic', 'https://www.theatlantic.com/feed/channel/sexes/'),
            ('Education | The Atlantic', 'https://www.theatlantic.com/feed/channel/education/'),
            ('Science | The Atlantic', 'https://www.theatlantic.com/feed/channel/science/'),
            ('News | The Atlantic', 'https://www.theatlantic.com/feed/channel/news/'),
            ('Press Releases | The Atlantic', 'https://www.theatlantic.com/feed/channel/press-releases/'),
            ('Newsletters | The Atlantic', 'https://www.theatlantic.com/feed/channel/newsletters/'),
            ('The Atlantic Photo', 'https://feeds.feedburner.com/theatlantic/infocus'),
            ('Notes | The Atlantic', 'https://feeds.feedburner.com/TheAtlanticNotes'),
        ]
    else:

        def parse_index(self):
            """Return the list of ``(section title, articles)`` to download.

            With ``test_article`` set, a single-article list is returned.
            Otherwise the magazine index page is read: headings open a new
            section and links become articles of the current section.
            Also sets ``self.cover_url`` when a cover image is found.
            """
            if test_article:
                return [('Articles', [{'title': 'Test article', 'url': test_article}])]
            soup = self.index_to_soup(self.INDEX)
            img = soup.find(**prefix_classes('IssueDescription_cover__'))
            if img is not None:
                self.cover_url = img['src']
            # Articles seen before the first section heading are filed under
            # 'Cover Story'.
            current_section, current_articles = 'Cover Story', []
            feeds = []
            for x in soup.findAll(**prefix_classes('TocFeaturedSection_heading__ TocSection_heading__ TocHeroGridItem_hedLink___ TocGridItem_hedLink__')):
                cls = x['class']
                # The class attribute may be a list of names rather than a
                # single string; normalise to one string.
                if not isinstance(cls, str):
                    cls = ' '.join(cls)
                title = self.tag_to_string(x).strip()
                if 'Section' in cls:
                    # A heading: flush the section collected so far and
                    # start a new one.
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_section, current_articles = title, []
                    self.log(current_section)
                    continue
                url = x['href']
                current_articles.append({'title': title, 'url': url})
                self.log('\t', title, url)
            if current_articles:
                feeds.append((current_section, current_articles))
            return feeds
if __name__ == '__main__':
    # Debugging entry point: read the saved page whose path is the last
    # command line argument and print the HTML generated from its JSON.
    import sys
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    # Use a context manager so the file handle is closed promptly instead of
    # being left for the garbage collector.
    with open(sys.argv[-1]) as f:
        page = f.read()
    print(extract_html(BeautifulSoup(page)))