
Commit e005877

- rename output routeFareList.min.json to routeFareList.mergeRoutes.min.json in previous workflow (mergeRoutes.py)
- add mergeStopList stop grouping to alpha json for testing
- update Python 3.8 to 3.12
- update pyproj 3.3.0 to 3.6.1
- update outdated actions/checkout@v2 to actions/checkout@v4
- update outdated actions/setup-python@v4 to actions/setup-python@v5
- update outdated actions/upload-artifact@v3 to actions/upload-artifact@v4
1 parent 6cd056f commit e005877

4 files changed: +277 -24 lines

Diff for: .github/workflows/fetch-data.yml (+20, -6)

@@ -11,11 +11,11 @@ jobs:

     steps:
       - name: Check out repository code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
       - name: Setup Python environment
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: '3.12'
           architecture: 'x64'
           cache: 'pip'
           cache-dependency-path: crawling/requirements.txt
@@ -45,11 +45,12 @@ jobs:
           python ./crawling/matchGtfs.py
           python ./crawling/cleansing.py
           python ./crawling/mergeRoutes.py
+          python ./crawling/mergeStopList.py
           python ./crawling/routeCompare.py
           python ./crawling/mtrExits.py

       - name: Archive crawling outputs
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: always()
         with:
           name: Crawled Files
@@ -59,6 +60,7 @@ jobs:
             routeFareList*
             routeList*
             stopList*
+            stopMap*
             routeTime.json
             gtfs
             gtfs.json
@@ -68,11 +70,23 @@ jobs:
             route-ts/
             exits.mtr.json
       - name: Update MD5
-        run: md5sum routeFareList.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.md5
+        run: |
+          md5sum routeFareList.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.md5
+          md5sum routeFareList.alpha.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.alpha.md5
       - name: create deployment folder
         run: mkdir -p build
       - name: cp files into deployment folder
-        run: cp -r routeFareList.json routeFareList.min.json routeFareList.md5 CNAME exits.mtr.json route-ts build/
+        run: |
+          cp \
+            routeFareList.json \
+            routeFareList.min.json \
+            routeFareList.alpha.json \
+            routeFareList.alpha.min.json \
+            routeFareList.md5 \
+            routeFareList.alpha.md5 \
+            CNAME \
+            exits.mtr.json \
+            build/
       - name: cp route-ts into deployment folder
         run: cp -r route-ts build
       - name: Update resources
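
For local troubleshooting, the two digest files can be reproduced outside the workflow. Here is a minimal Python sketch of the same "Update MD5" step; the loop and the filename derivation are illustrative, not part of the commit:

import hashlib

# Mirrors: md5sum <file> | cut -f1 -d ' ' | tr -d $'\n' > <name>.md5
for name in ('routeFareList.min.json', 'routeFareList.alpha.min.json'):
    with open(name, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    out = name.replace('.min.json', '.md5')  # routeFareList.md5, routeFareList.alpha.md5
    with open(out, 'w') as f:
        f.write(digest)  # bare hex digest with no trailing newline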

Diff for: crawling/mergeRoutes.py (+19, -16)

@@ -68,25 +68,28 @@ def importRouteListJson( co ):
                 stop_b = stopList[stop_b]
                 dist = haversine(
                     (stop_a['location']['lat'], stop_a['location']['lng']),
-                    (stop_b['location']['lat'], stop_b['location']['lng'])
-                ) * 1000 # in meter
+                    (stop_b['location']['lat'], stop_b['location']['lng']),
+                    unit=Unit.METERS # specify that we want distance in metres, default unit is km
+                )
                 merge = merge and dist < 300
             if merge:
                 found = True
                 route['stops'].append((co, _route['stops']))
                 route['bound'][co] = _route['bound']
-                for i in range(0, route['seq']):
-                    if route['stops'][0][0] == co:
-                        # skip if same company
-                        continue
-                    if route['stops'][0][1][i] not in stopMap:
-                        stopMap[route['stops'][0][1][i]] = [(co, _route['stops'][i])]
-                    elif (co, _route['stops'][i]) not in stopMap[route['stops'][0][1][i]]:
-                        stopMap[route['stops'][0][1][i]].append( (co, _route['stops'][i]) )
-                    if _route['stops'][i] not in stopMap:
-                        stopMap[_route['stops'][i]] = [(route['stops'][0][0], route['stops'][0][1][i])]
-                    elif (route['stops'][0][0], route['stops'][0][1][i]) not in stopMap[_route['stops'][i]]:
-                        stopMap[_route['stops'][i]].append( (route['stops'][0][0], route['stops'][0][1][i]) )
+                #### stopMap will be generated by mergeStopList.py, hence commented below ####
+                # for i in range(0, route['seq']):
+                #     if route['stops'][0][0] == co:
+                #         # skip if same company
+                #         continue
+                #     if route['stops'][0][1][i] not in stopMap:
+                #         stopMap[route['stops'][0][1][i]] = [(co, _route['stops'][i])]
+                #     elif (co, _route['stops'][i]) not in stopMap[route['stops'][0][1][i]]:
+                #         stopMap[route['stops'][0][1][i]].append( (co, _route['stops'][i]) )
+                #     if _route['stops'][i] not in stopMap:
+                #         stopMap[_route['stops'][i]] = [(route['stops'][0][0], route['stops'][0][1][i])]
+                #     elif (route['stops'][0][0], route['stops'][0][1][i]) not in stopMap[_route['stops'][i]]:
+                #         stopMap[_route['stops'][i]].append( (route['stops'][0][0], route['stops'][0][1][i]) )
+                ####

         if not found:
             routeList.append(
@@ -177,8 +180,8 @@ def standardizeDict(d):
         'serviceDayMap': serviceDayMap,
     })

-    with open( 'routeFareList.json', 'w', encoding='UTF-8' ) as f:
+    with open( 'routeFareList.mergeRoutes.json', 'w', encoding='UTF-8' ) as f:
         f.write(json.dumps(db, ensure_ascii=False, indent=4))

-    with open( 'routeFareList.min.json', 'w', encoding='UTF-8' ) as f:
+    with open( 'routeFareList.mergeRoutes.min.json', 'w', encoding='UTF-8' ) as f:
         f.write(json.dumps(db, ensure_ascii=False, separators=(',', ':')))
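
The unit switch in the first hunk is behaviour-preserving: haversine returns kilometres by default, so the old * 1000 scaling and the new unit=Unit.METERS argument should agree. A quick sanity sketch; the coordinates are made-up sample values, not taken from the crawled data:

from haversine import haversine, Unit

a = (22.302711, 114.177216)  # hypothetical stop locations
b = (22.308046, 114.188500)

dist_old = haversine(a, b) * 1000             # old form: km scaled to metres
dist_new = haversine(a, b, unit=Unit.METERS)  # new form: metres directly
assert abs(dist_old - dist_new) < 1e-6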

Diff for: crawling/mergeStopList.py (+236, new file)

@@ -0,0 +1,236 @@
import logging
import math
import json
import time
from haversine import haversine, Unit

def get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id):
    DISTANCE_THRESHOLD = 50  # in metres
    BEARING_THRESHOLD = 45  # in degrees
    STOP_LIST_LIMIT = 50  # max number of stops in a group

    def get_stops_haversine_distance(stop_a, stop_b):
        return haversine(
            (stop_a['location']['lat'], stop_a['location']['lng']),
            (stop_b['location']['lat'], stop_b['location']['lng']),
            unit=Unit.METERS  # specify that we want distance in metres, default is km
        )

    bearing_targets = stop_seq_mapping.get(stop_id, {}).get('bearings', [])

    def is_bearing_in_range(bearing):
        if BEARING_THRESHOLD >= 180 or not bearing_targets:
            return True
        for target in bearing_targets:
            bearing_min = target - BEARING_THRESHOLD
            bearing_max = target + BEARING_THRESHOLD
            if bearing_min < 0:
                bearing_min += 360
            if bearing_max > 360:
                bearing_max -= 360
            if (bearing_min <= bearing <= bearing_max or
                    (bearing_min > bearing_max and (bearing <= bearing_max or bearing >= bearing_min))):
                return True
        return False

    def search_nearby_stops(target_stop_id, excluded_stop_id_list):
        target_stop = stop_list[target_stop_id]
        # take lat/lng up to 3 decimal places, that's about a 100m x 100m square
        lat = int(target_stop['location']['lat'] * 1000)
        lng = int(target_stop['location']['lng'] * 1000)

        nearby_stops = []
        for stop_id in stop_list_grid.get(f"{lat}_{lng}", []):
            if (stop_id not in excluded_stop_id_list
                    and get_stops_haversine_distance(target_stop, stop_list[stop_id]) <= DISTANCE_THRESHOLD):
                bearings = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
                if any(is_bearing_in_range(b) for b in bearings):
                    nearby_stops.append({
                        'id': stop_id,
                        'co': stop_seq_mapping.get(stop_id, {}).get('co', '')
                    })
        return nearby_stops

    stop_group = []
    stop_list_entries = search_nearby_stops(stop_id, [])

    # recursively search for nearby stops within thresholds (distance and bearing)
    # stop searching when no new stops are found within range, or when the stop list is getting too large
    i = 0
    while i < len(stop_list_entries):
        entry = stop_list_entries[i]
        stop_group.append([entry['co'], entry['id']])
        i += 1
        if len(stop_list_entries) < STOP_LIST_LIMIT:
            stop_list_entries.extend(search_nearby_stops(entry['id'], [e['id'] for e in stop_list_entries]))

    # to reduce the size of routeFareList.min.json, exclude the current stop_id from the final output stopMap
    return [stop for stop in stop_group if stop[1] != stop_id]
    # return stop_group

def get_bearing(a, b):
    φ1 = math.radians(a['lat'])
    φ2 = math.radians(b['lat'])
    λ1 = math.radians(a['lng'])
    λ2 = math.radians(b['lng'])

    y = math.sin(λ2 - λ1) * math.cos(φ2)
    x = (math.cos(φ1) * math.sin(φ2) -
         math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1))
    θ = math.atan2(y, x)
    brng = (math.degrees(θ) + 360) % 360  # in degrees
    return brng

def get_stop_bearings(route_stops):
    unique_routes = []
    bearings = []
    for route_stop in route_stops:
        if route_stop['bearing'] != -1:
            unique_route = f"{route_stop['co']}_{route_stop['routeKey'].split('+')[0]}_{route_stop['bearing']}"
            if unique_route not in unique_routes:
                unique_routes.append(unique_route)
                bearings.append(route_stop['bearing'])

    if not bearings:
        return []

    BEARING_THRESHOLD = 45  # in degrees
    BEARING_EPSILON = 10e-6  # very small number
    bearing_groups = []

    for bearing in bearings:
        if bearing == -1:
            continue
        if not bearing_groups:
            bearing_groups.append([bearing])
            continue

        for group in bearing_groups:
            if any(abs(b - bearing) < BEARING_EPSILON for b in group):
                break
            if any(abs(b - bearing) <= BEARING_THRESHOLD or abs(b - bearing) >= 360 - BEARING_THRESHOLD for b in group):
                group.append(bearing)
                break
        else:
            bearing_groups.append([bearing])

    if len(bearing_groups) == 1:
        return bearing_groups[0]

    longest_length = max(len(group) for group in bearing_groups)
    return [b for group in bearing_groups if len(group) == longest_length for b in group]

# Main function to process stops
def merge_stop_list():
    # Read the result from the previous pipeline step
    with open('routeFareList.mergeRoutes.min.json', 'r', encoding='UTF-8') as f:
        db = json.load(f)

    route_list = db['routeList']
    stop_list = db['stopList']
    start_time = time.time()
    stop_seq_mapping = {}

    # Preprocess the list of bearings for each stop
    for route_key, route_list_entry in route_list.items():
        stops = route_list_entry.get('stops', {})
        for co, co_stops in stops.items():
            for stop_pos, stop_id in enumerate(co_stops):
                if stop_id not in stop_seq_mapping:
                    stop_seq_mapping[stop_id] = {"routeStops": [], "co": co, "bearings": []}
                if stop_pos == len(co_stops) - 1:
                    stop_seq_mapping[stop_id]['routeStops'].append({
                        'routeKey': route_key,
                        'co': co,
                        'seq': stop_pos,
                        'bearing': -1
                    })
                else:
                    bearing = get_bearing(stop_list[stop_id]['location'], stop_list[co_stops[stop_pos + 1]]['location'])
                    stop_seq_mapping[stop_id]['routeStops'].append({
                        'routeKey': route_key,
                        'co': co,
                        'seq': stop_pos,
                        'bearing': bearing
                    })

    for stop_id in stop_seq_mapping.keys():
        stop_seq_mapping[stop_id]['bearings'] = get_stop_bearings(stop_seq_mapping[stop_id]['routeStops'])

    # Just dump the json in case of a need for trouble-shooting, but otherwise we do not need this file
    with open('stopMap.routeStopsSequence.json', 'w', encoding='UTF-8') as f:
        json.dump(stop_seq_mapping, f)

    logger.info(f'Processed routeStopsSequence in {(time.time() - start_time) * 1000:.2f}ms')

    # Preprocess stopList, organise stops into ~100m x ~100m squares to reduce the size of the nested loop later
    stop_list_grid = {}
    for stop_id, stop in stop_list.items():
        # take lat/lng up to 3 decimal places, that's about a 100m x 100m square
        lat = int(stop['location']['lat'] * 1000)
        lng = int(stop['location']['lng'] * 1000)
        # add the stop into the 9 grid boxes surrounding this stop
        grid = [
            f"{lat - 1}_{lng - 1}",
            f"{lat    }_{lng - 1}",
            f"{lat + 1}_{lng - 1}",
            f"{lat - 1}_{lng    }",
            f"{lat    }_{lng    }",
            f"{lat + 1}_{lng    }",
            f"{lat - 1}_{lng + 1}",
            f"{lat    }_{lng + 1}",
            f"{lat + 1}_{lng + 1}",
        ]
        for grid_id in grid:
            if grid_id not in stop_list_grid:
                stop_list_grid[grid_id] = []
            stop_list_grid[grid_id].append(stop_id)

    target_stop_list = list(stop_list.items())
    stop_map = {}
    count = 0
    group_count = 0

    for stop_id, stop in target_stop_list:
        count += 1
        # if count % 1000 == 0:
        #     logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")

        stop_group = get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id)
        if len(stop_group) > 0:
            group_count += 1
            stop_map[stop_id] = stop_group

    logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")

    with open('stopMap.json', 'w', encoding='UTF-8') as f:
        json.dump(stop_map, f, indent=4)

    db['stopMap'] = stop_map

    with open('routeFareList.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f, indent=4)

    # reduce the size of routeFareList.min.json by rounding lat/lng values to 5 decimal places
    # 5 d.p. is roughly one-metre accuracy, which is good enough for this project
    # saves around 50kb in size for 14,000 stops
    for stop_id, stop in target_stop_list:
        stop_list[stop_id]['location']['lat'] = float('%.5f' % (stop_list[stop_id]['location']['lat']))
        stop_list[stop_id]['location']['lng'] = float('%.5f' % (stop_list[stop_id]['location']['lng']))

    db['stopList'] = stop_list

    logger.info(f"Reduced location lat/lng to 5 d.p. at {(time.time() - start_time) * 1000:.2f}ms")

    with open('routeFareList.alpha.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f, indent=4)

    with open('routeFareList.min.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f)

    with open('routeFareList.alpha.min.json', 'w', encoding='UTF-8') as f:
        json.dump(db, f)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    merge_stop_list()
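
Two pieces of geometry in the new module are worth a worked example: the grid key that buckets stops into roughly 100 m squares, and the forward-azimuth formula behind get_bearing(). A self-contained sketch with made-up coordinates, not taken from the repository data:

import math

# Hypothetical stop locations; real input comes from routeFareList.mergeRoutes.min.json.
stop_a = {'lat': 22.302711, 'lng': 114.177216}
stop_b = {'lat': 22.302711, 'lng': 114.178216}  # roughly 100 m due east of stop_a

# Grid key as in merge_stop_list(): truncating to 3 d.p. gives ~100 m squares.
print(f"{int(stop_a['lat'] * 1000)}_{int(stop_a['lng'] * 1000)}")  # 22302_114177

# Forward azimuth as in get_bearing(): a due-east neighbour comes out near 90 degrees.
φ1, φ2 = math.radians(stop_a['lat']), math.radians(stop_b['lat'])
λ1, λ2 = math.radians(stop_a['lng']), math.radians(stop_b['lng'])
y = math.sin(λ2 - λ1) * math.cos(φ2)
x = math.cos(φ1) * math.sin(φ2) - math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1)
print((math.degrees(math.atan2(y, x)) + 360) % 360)  # ≈ 90.0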

Diff for: crawling/requirements.txt (+2, -2)

@@ -3,7 +3,7 @@ certifi==2020.12.5
 cffi==1.15.0
 chardet==4.0.0
 cryptography==3.4.7
-haversine==2.3.0
+haversine>=2.3.0
 idna==2.10
 pycparser==2.20
 pyOpenSSL==20.0.1
@@ -12,7 +12,7 @@ PySocks==1.7.1
 six==1.15.0
 urllib3==1.26.4
 wheel==0.36.2
-pyproj==3.3.0
+pyproj>=3.6.1
 httpx==0.25.2
 xxhash==3.2.0
 -e .
