Skip to content

Commit

Permalink
feat: Switch to use async emitRequest for lrtfeeder, mtr and parseGtfs
Browse files Browse the repository at this point in the history
  • Loading branch information
rtau committed Aug 1, 2024
1 parent 91d8776 commit d920795
Show file tree
Hide file tree
Showing 4 changed files with 377 additions and 331 deletions.
121 changes: 67 additions & 54 deletions crawling/lrtfeeder.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,75 @@
# -*- coding: utf-8 -*-
# MTR Bus fetching

import asyncio
import csv
import requests
import json

# NOTE(review): this run of statements uses the blocking `requests` client and
# is cut off inside the dict literal below; it looks like the pre-change side
# of the diff for lrtfeeder.py - confirm before relying on it.
# Module-level accumulators: routes keyed by "<route>_<bound>", stops by id.
routeList = {}
stopList = {}

# Download the MTR bus route table (CSV) and force UTF-8 decoding.
r = requests.get('https://opendata.mtr.com.hk/data/mtr_bus_routes.csv')
r.encoding = 'utf-8'
reader = csv.reader(r.text.split("\n") )
# Consume the first row (column headers); the value is not used afterwards.
headers = next(reader,None)
# Keep only rows with exactly four columns.
routes = [route for route in reader if len(route) == 4]
for [route, chn, eng, circular] in routes:
# Skip rows whose route number is empty.
if route == '':
continue
# The names are split on '至' (Chinese) / ' to ' (English): the part before
# the separator is the start, the part after it is the end. A name without
# the separator raises IndexError on the [1] lookups below.
start = {
"zh": chn.split('至')[0],
"en": eng.split(' to ')[0]
}
end = {
"zh": chn.split('至')[1],
"en": eng.split(' to ')[1]
}
# One entry per bound: 'O' keeps start -> end, 'I' swaps the two terminals.
for bound in ['I', 'O']:
routeList[route+"_"+bound] = {
"route": route,
"bound": bound,
"service_type": "1",
"orig_tc": start['zh'] if bound == 'O' else end['zh'],
"dest_tc": end["zh"] if bound == 'O' else start['zh'],
"orig_en": start['en'] if bound == 'O' else end['en'],
"dest_en": end["en"] if bound == 'O' else start['en'],
"stops": []
import logging
import httpx

from crawl_utils import emitRequest

async def getRouteStop(co = 'lrtfeeder'):
    """Fetch the MTR bus routes and stops and dump them as JSON files.

    Downloads ``mtr_bus_routes.csv`` and ``mtr_bus_stops.csv`` through
    ``emitRequest``, builds one route entry per route and bound ('I' and
    'O'), appends each stop id to its route, and writes
    ``routeList.lrtfeeder.json`` (routes that have at least one stop) and
    ``stopList.lrtfeeder.json`` to the current working directory.

    Args:
        co: Company code. Currently not read by this function.

    Raises:
        IndexError: If a route name does not contain the '至' / ' to '
            separator used to split it into start and end.
    """
    routeList = {}
    stopList = {}

    # `async with` closes the client's connection pool even when a request or
    # the parsing below raises; the bare client was previously never closed.
    async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) as a_client:
        # NOTE(review): the URLs below go through a workers.dev host rather
        # than opendata.mtr.com.hk directly - confirm this is intended.
        r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_bus_routes.csv', a_client)
        r.encoding = 'utf-8'
        reader = csv.reader(r.text.split("\n"))
        next(reader, None)  # skip the header row
        for row in reader:
            # Only well-formed rows (exactly four columns) with a route number.
            if len(row) != 4:
                continue
            route, chn, eng, _circular = row
            if route == '':
                continue
            zh_parts = chn.split('至')
            en_parts = eng.split(' to ')
            start = {"zh": zh_parts[0], "en": en_parts[0]}
            end = {"zh": zh_parts[1], "en": en_parts[1]}
            for bound in ['I', 'O']:
                # 'O' runs start -> end; 'I' is the same route reversed.
                orig, dest = (start, end) if bound == 'O' else (end, start)
                routeList[route+"_"+bound] = {
                    "route": route,
                    "bound": bound,
                    "service_type": "1",
                    "orig_tc": orig['zh'],
                    "dest_tc": dest['zh'],
                    "orig_en": orig['en'],
                    "dest_en": dest['en'],
                    "stops": []
                }

        # Parse stops
        r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_bus_stops.csv', a_client)
        r.encoding = 'utf-8'
        reader = csv.reader(r.text.split("\n"))
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) != 8:
                continue
            route, bound, _seq, stationId, lat, lng, name_zh, name_en = row
            routeKey = route+"_"+bound
            if routeKey in routeList:
                routeList[routeKey]['stops'].append(stationId)
            else:
                # Stop refers to a route/bound missing from the routes file.
                print ("error", routeKey)
            stopList[stationId] = {
                "stop": stationId,
                "name_en": name_en,
                "name_tc": name_zh,
                "lat": lat,
                "long": lng
            }

    with open('routeList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
        f.write(json.dumps([route for route in routeList.values() if len(route['stops']) > 0], ensure_ascii=False))
    with open('stopList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
        f.write(json.dumps(stopList, ensure_ascii=False))

# Parse stops
# NOTE(review): like the route-parsing statements earlier in this listing, this
# uses the blocking `requests` client and module-level `routeList`/`stopList`;
# it looks like the pre-change side of the diff - confirm.
r = requests.get('https://opendata.mtr.com.hk/data/mtr_bus_stops.csv')
r.encoding = 'utf-8'
reader = csv.reader(r.text.split("\n") )
# Consume the first row (column headers); the value is not used afterwards.
headers = next(reader,None)
# Keep only rows with exactly eight columns.
stops = [stop for stop in reader if len(stop) == 8]
for [route, bound, seq, stationId, lat, lng, name_zh, name_en] in stops:
routeKey = route+"_"+bound
# Attach the stop to its route; a route/bound key that is not in routeList is
# reported on stdout.
if routeKey in routeList:
routeList[routeKey]['stops'].append(stationId)
else:
print ("error", routeKey)
# Record the stop's names and coordinates keyed by station id; lat/lng are
# stored as the raw CSV strings.
stopList[stationId] = {
"stop": stationId,
"name_en": name_en,
"name_tc": name_zh,
"lat": lat,
"long": lng
}

# Dump only routes that ended up with at least one stop, then every stop.
with open('routeList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps([route for route in routeList.values() if len(route['stops']) > 0], ensure_ascii=False))
with open('stopList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(stopList, ensure_ascii=False))
if __name__ == '__main__':
    # Script entry point: run the crawler once with INFO-level logging.
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    # Raise httpx's own logger to WARNING so only warnings and errors show.
    logging.getLogger('httpx').setLevel(logging.WARNING)
    asyncio.run(getRouteStop())
107 changes: 60 additions & 47 deletions crawling/mtr.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,72 @@
# -*- coding: utf-8 -*-
# MTR rail lines and stations fetching

import asyncio
import csv
import requests
import json
from pyproj import Transformer

# NOTE(review): this run of module-level statements uses the blocking
# `requests` client; it looks like the pre-change side of the diff for mtr.py -
# confirm before relying on it.
# Coordinate transformer from EPSG:2326 to EPSG:4326.
epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326')

# Module-level accumulators: routes keyed by "<route>_<bound>", stops by code.
routeList = {}
stopList = {}

# Download the MTR lines-and-stations table (CSV) and force UTF-8 decoding.
r = requests.get('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv')
r.encoding = 'utf-8'
reader = csv.reader(r.text.split("\n") )
# Consume the first row (column headers); the value is not used afterwards.
headers = next(reader,None)
# Keep only rows with exactly seven columns.
routes = [route for route in reader if len(route) == 7]
for [route, bound, stopCode, stopId, chn, eng, seq] in routes:
# Skip rows whose line code is empty.
if route == "":
continue
# First row seen for this line/bound: create its entry. "stops" is a fixed
# list of 100 None placeholders that is filled by sequence number below.
if route+"_"+bound not in routeList:
routeList[route+"_"+bound] = {
"gtfsId": None,
"route": route,
"bound": bound,
"service_type": "1",
"orig_tc": None,
"orig_en": None,
"dest_tc": None,
"dest_en": None,
"stops": [None] * 100,
"fare": []
}
# seq is parsed via float() first, so values such as "1.0" are accepted.
if int(float(seq)) == 1:
routeList[route+"_"+bound]["orig_tc"] = chn
routeList[route+"_"+bound]["orig_en"] = eng
routeList[route+"_"+bound]["dest_tc"] = chn
routeList[route+"_"+bound]["dest_en"] = eng
# Raises IndexError if the sequence number is 100 or more.
routeList[route+"_"+bound]["stops"][int(float(seq))] = stopCode
# Look up each station's location once, the first time its code is seen.
if stopCode not in stopList:
r = requests.get('https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=港鐵'+chn+"站", headers={'Accept': 'application/json'})
# Uses the first search result; its 'y' and 'x' fields are passed to the
# transformer in that order, and the JSON body is parsed twice here.
lat, lng = epsgTransformer.transform( r.json()[0]['y'], r.json()[0]['x'] )
stopList[stopCode] = {
"stop": stopCode,
"name_en": eng,
"name_tc": chn,
"lat": lat,
"long": lng
}
import logging
import httpx

from crawl_utils import emitRequest

def filterStops(route):
    """Drop the ``None`` placeholders from ``route['stops']``.

    The route dict is modified in place and the same object is returned, so
    the function can be used directly with ``map``.
    """
    kept = []
    for code in route['stops']:
        # Identity check on None only: other falsy values are kept.
        if code is not None:
            kept.append(code)
    route['stops'] = kept
    return route

# NOTE(review): module-level writes that read the global routeList/stopList;
# this looks like the pre-change side of the diff - confirm.
# Strip the None placeholders from each route's stops via filterStops, then
# dump the routes and the stops as UTF-8 JSON without ASCII escaping.
with open('routeList.mtr.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(list(map(filterStops, [route for route in routeList.values() if len(route['stops']) > 0])), ensure_ascii=False))
with open('stopList.mtr.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(stopList, ensure_ascii=False))
async def getRouteStop(co = 'mtr'):
    """Fetch the MTR lines and stations and dump them as JSON files.

    Downloads ``mtr_lines_and_stations.csv`` through ``emitRequest``, builds
    one route entry per line and bound with its stops placed by sequence
    number, looks up each station's location once through the geodata
    ``locationSearch`` endpoint, and writes ``routeList.mtr.json`` and
    ``stopList.mtr.json`` to the current working directory.

    Args:
        co: Company code. Currently not read by this function.

    Raises:
        IndexError: If the location search returns no result for a station.
    """
    epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326')

    routeList = {}
    stopList = {}

    # `async with` closes the client's connection pool even when a request or
    # the parsing below raises; the bare client was previously never closed.
    async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) as a_client:
        # NOTE(review): the URLs below go through a workers.dev host rather
        # than the upstream hosts directly - confirm this is intended.
        r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv', a_client)
        r.encoding = 'utf-8'
        reader = csv.reader(r.text.split("\n"))
        next(reader, None)  # skip the header row
        # Materialise the rows first: `r` is rebound by the lookups below.
        rows = [row for row in reader if len(row) == 7]
        for route, bound, stopCode, _stopId, chn, eng, seq in rows:
            if route == "":
                continue
            routeKey = route+"_"+bound
            if routeKey not in routeList:
                routeList[routeKey] = {
                    "gtfsId": None,
                    "route": route,
                    "bound": bound,
                    "service_type": "1",
                    "orig_tc": None,
                    "orig_en": None,
                    "dest_tc": None,
                    "dest_en": None,
                    "stops": [None] * 100,
                    "fare": []
                }
            entry = routeList[routeKey]
            # seq is parsed via float() first, so values such as "1.0" work.
            seqNo = int(float(seq))
            if seqNo == 1:
                entry["orig_tc"] = chn
                entry["orig_en"] = eng
            # Overwritten on every row, so the last row seen for this
            # line/bound supplies the destination.
            entry["dest_tc"] = chn
            entry["dest_en"] = eng
            stops = entry["stops"]
            # Grow the placeholder list on demand instead of failing with
            # IndexError when a line has 100 or more sequence numbers.
            if seqNo >= len(stops):
                stops.extend([None] * (seqNo - len(stops) + 1))
            stops[seqNo] = stopCode
            if stopCode not in stopList:
                r = await emitRequest('https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=港鐵'+chn+"站", a_client, headers={'Accept': 'application/json'})
                # Parse the body once and use the first search result.
                location = r.json()[0]
                lat, lng = epsgTransformer.transform(location['y'], location['x'])
                stopList[stopCode] = {
                    "stop": stopCode,
                    "name_en": eng,
                    "name_tc": chn,
                    "lat": lat,
                    "long": lng
                }

    with open('routeList.mtr.json', 'w', encoding='UTF-8') as f:
        f.write(json.dumps(list(map(filterStops, [route for route in routeList.values() if len(route['stops']) > 0])), ensure_ascii=False))
    with open('stopList.mtr.json', 'w', encoding='UTF-8') as f:
        f.write(json.dumps(stopList, ensure_ascii=False))

if __name__ == '__main__':
    # Script entry point: run the crawler once with INFO-level logging.
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    # Raise httpx's own logger to WARNING so only warnings and errors show.
    logging.getLogger('httpx').setLevel(logging.WARNING)
    asyncio.run(getRouteStop())
Loading

0 comments on commit d920795

Please sign in to comment.