diff --git a/.gitignore b/.gitignore
index af3a05726..dcc8cbb64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,10 +18,6 @@ develop-eggs
 # Installer logs
 pip-log.txt
 
-# URL logs
-urlin.txt
-urlout.txt
-
 # Unit test / coverage reports
 .coverage
 .tox
diff --git a/check_urls.py b/check_urls.py
index c0a9e9018..918f1f937 100644
--- a/check_urls.py
+++ b/check_urls.py
@@ -2,8 +2,11 @@
 from concurrent import futures
 import multiprocessing as mp
 import os
+import json
 import uuid
 
+from bs4 import BeautifulSoup
+from markdown import markdown
 import requests
 import urllib3
 
@@ -12,19 +15,36 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 
 # Avoid rate limiting (tcp)
-_URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
-URL_HEADERS = {'User-Agent': _URL_BOT_ID}
-URL_TIMEOUT = 10.0
-
-# Sources of data (file)
-IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
-OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')
-
-# Collect repository URLs (bash)
-_URL_RE = 'https?:\/\/[=a-zA-Z0-9\_\/\?\&\.\-]+'  # proto://host+path+params
-_FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
-_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
-COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
+URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
+
+
+def extract_urls_from_html(content, all_urls):
+    soup = BeautifulSoup(content, 'html.parser')
+    for a in soup.find_all('a', href=True):
+        url = a['href']
+        if url.startswith('http'):
+            all_urls.add(url)
+
+
+def extract_urls(discover_path):
+    exclude = ['.git', '.vscode']
+    all_urls = set()
+    max_strlen = -1
+    for root, dirs, files in os.walk(discover_path, topdown=True):
+        dirs[:] = [d for d in dirs if d not in exclude]
+        for file in files:
+            output = f'Currently checking: file={file}'
+            file_path = os.path.join(root, file)
+            if max_strlen < len(output):
+                max_strlen = len(output)
+            print(output.ljust(max_strlen), end='\r')
+            if file_path.endswith('.html'):
+                content = open(file_path)
+                extract_urls_from_html(content, all_urls)
+            elif file_path.endswith('.markdown'):
+                content = markdown(open(file_path).read())
+                extract_urls_from_html(content, all_urls)
+    return all_urls
 
 
 def run_workers(work, data, worker_threads=mp.cpu_count()*4):
@@ -42,13 +62,15 @@ def get_url_status(url):
     clean_url = url.strip('?.')
     try:
         response = requests.get(
-            clean_url, verify=False, timeout=URL_TIMEOUT,
-            headers=URL_HEADERS)
+            clean_url, verify=False, timeout=10.0,
+            headers={'User-Agent': URL_BOT_ID})
         return (clean_url, response.status_code)
     except requests.exceptions.Timeout:
         return (clean_url, 504)
     except requests.exceptions.ConnectionError:
         return (clean_url, -1)
+    except requests.exceptions.TooManyRedirects:
+        return (clean_url, -1)
 
 
 def bad_url(url_status):
@@ -65,22 +87,20 @@
 
 def main():
     print('Extract urls...')
-    os.system(COMMAND)
-    with open(IN_PATH, 'r') as fr:
-        urls = map(lambda l: l.strip('\n'), fr.readlines())
-    with open(OUT_PATH, 'w') as fw:
-        url_id = 1
-        max_strlen = -1
-        for url_path, url_status in run_workers(get_url_status, urls):
-            output = 'Currently checking: id={uid} host={uhost}'.format(
-                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
-            if max_strlen < len(output):
-                max_strlen = len(output)
-            print(output.ljust(max_strlen), end='\r')
-            if bad_url(url_status) is True:
-                fw.write('{}: {}\n'.format(url_path, url_status))
-            url_id += 1
-    print('\nDone.')
+    all_urls = extract_urls(os.getcwd())
+    print('\nCheck urls...')
+    bad_urls = {}
+    url_id = 1
+    max_strlen = -1
+    for url_path, url_status in run_workers(get_url_status, all_urls):
+        output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
+        if max_strlen < len(output):
+            max_strlen = len(output)
+        print(output.ljust(max_strlen), end='\r')
+        if bad_url(url_status) is True:
+            bad_urls[url_path] = url_status
+        url_id += 1
+    print(f'\nBad urls: {json.dumps(bad_urls, indent=4)}')
 
 
 if __name__ == '__main__':