-
Notifications
You must be signed in to change notification settings - Fork 10
/
unsplashDl.py
81 lines (62 loc) · 2.91 KB
/
unsplashDl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
'''
Chapter 12 Web Scraping
Image Site Downloader
Write a program that goes to a photo-sharing site like Flickr or Imgur,
searches for a category of photos, and then downloads all the resulting
images. You could write a program that works with any photo site that
has a search feature.
'''
# unsplashDl.py - Downloads all images that an unsplash.com search
# returns for the given search term
# USAGE: python unsplashDl.py searchterm
import logging
import os
import re
import sys
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
# Base URL of the Unsplash photo search; the search term is appended to it.
# NOTE: this must point directly at unsplash.com, not at a mirror/proxy host.
UNSPLASHSEARCHURL = 'https://unsplash.com/s/photos/'
# Appended to a photo link's href to build the download request URL.
UNSPLASHDOWNLOADURLSUFFIX = '/download?force=true'
# Seconds to wait after each scroll before measuring the page height again.
SCROLL_PAUSE_TIME = 1
# Group 2 captures everything after "&dl=" up to the end of the URL; the
# script uses that text as the local filename.
FILENAMEREGEX = r'(&dl=)(.*?)$'
# Root logger: timestamped messages at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# The Selenium-related loggers are very chatty, so only their errors are
# let through.
urllib3Logger, seleniumRemoteLogger = map(
    logging.getLogger,
    (
        'urllib3.connectionpool',
        'selenium.webdriver.remote.remote_connection',
    ),
)
urllib3Logger.setLevel(logging.ERROR)
seleniumRemoteLogger.setLevel(logging.ERROR)
def _scrollToBottom(browser, pause=SCROLL_PAUSE_TIME):
    """Scroll *browser* down until the page height stops growing.

    Unsplash uses endless scrolling, so to see ALL results the page has to
    be scrolled to its true bottom.

    Args:
        browser: the Selenium WebDriver showing the search results.
        pause: seconds to wait after each scroll before re-measuring.
    """
    last_height = 0
    while True:
        # Scroll down to the current bottom of the page.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for the page to load more results.
        time.sleep(pause)
        # An unchanged height means nothing new was loaded: we are done.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _collectPhotoLinks(searchURL):
    """Return the download URL of every photo link found on *searchURL*.

    Opens the page in Firefox, scrolls to the bottom and reads the href of
    each matching link. The browser is closed before this function returns,
    so it is not kept open while the files are downloaded.
    """
    with webdriver.Firefox() as browser:
        browser.get(searchURL)
        time.sleep(2)
        _scrollToBottom(browser)
        # Photo links are <a> elements carrying itemprop="contentUrl".
        # find_elements_by_css_selector() was removed in Selenium 4.3, so the
        # generic find_elements() with a By locator is used instead.
        photoLinkElements = browser.find_elements(By.CSS_SELECTOR, "a[itemprop=contentUrl]")
        # The elements are only usable while the browser is open, so the
        # hrefs are read here, inside the `with` block.
        hrefs = [element.get_attribute("href") for element in photoLinkElements]
    # Links without an href cannot be downloaded; drop them.
    return [f'{href}{UNSPLASHDOWNLOADURLSUFFIX}' for href in hrefs if href]


def _downloadPhoto(photoDeepLink, timeout=30):
    """Download one photo into the current working directory.

    Args:
        photoDeepLink: download URL of the photo.
        timeout: seconds passed to requests as the request timeout.

    Returns:
        True if the file was written, False if it was skipped (request
        failed or no filename could be derived from the final URL).
    """
    logging.info(f'downloading {photoDeepLink}')
    try:
        # stream=True makes iter_content() read the body in chunks instead of
        # loading the whole image into memory; the `with` closes the
        # connection afterwards.
        with requests.get(photoDeepLink, stream=True, timeout=timeout) as res:
            res.raise_for_status()
            # The filename is taken from the final (post-redirect) URL.
            filenameMo = re.search(FILENAMEREGEX, res.url)
            if filenameMo is None:
                logging.warning('failed to extract filename, skipping file')
                return False
            # The name comes from a server-supplied URL: keep only its last
            # path component so nothing is written outside the current
            # directory.
            photoFilename = os.path.basename(filenameMo.group(2))
            if not photoFilename:
                logging.warning('failed to extract filename, skipping file')
                return False
            logging.debug(f'writing {photoFilename}')
            with open(photoFilename, mode='wb') as photoFile:
                for chunk in res.iter_content(100000):
                    photoFile.write(chunk)
    except requests.RequestException as err:
        # One failed download should not abort the remaining ones.
        logging.warning('download of %s failed (%s), skipping file', photoDeepLink, err)
        return False
    return True


# Script entry: all command line arguments together form the search term.
if len(sys.argv) > 1:
    searchTerm = '-'.join(sys.argv[1:])
    searchURL = UNSPLASHSEARCHURL + searchTerm
    logging.debug(searchURL)
    photoDeepLinks = _collectPhotoLinks(searchURL)
    logging.debug(f'found {len(photoDeepLinks)} photos for {searchTerm}')
    for photoDeepLink in photoDeepLinks:
        _downloadPhoto(photoDeepLink)
else:
    print('USAGE: python unsplashDl.py searchterm')