Skip to content
Closed
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2955411
feat: added instrumentation to all rpcs
daniel-sanche Jan 26, 2024
2ba1321
added system test for attempt_latencies metrics
daniel-sanche Jan 31, 2024
87bd1db
added tests for other latency types
daniel-sanche Jan 31, 2024
6f79b9a
added system tests for all latency metrics
daniel-sanche Jan 31, 2024
d736503
added tests for counts
daniel-sanche Jan 31, 2024
20260dc
Merge branch 'client_side_metrics_handlers' into client_side_metrics_…
daniel-sanche Jan 31, 2024
1058fad
Merge branch 'client_side_metrics_handlers' into client_side_metrics_…
daniel-sanche Feb 1, 2024
67b545e
catch timed out operations
daniel-sanche Feb 2, 2024
67a6fcd
fixed bug in retry detection
daniel-sanche Feb 2, 2024
ef0fdd8
reworking metric system tests
daniel-sanche Feb 3, 2024
7ee253e
got first test working
daniel-sanche Feb 3, 2024
28d162b
fixed test
daniel-sanche Feb 5, 2024
6caba34
added system tests
daniel-sanche Feb 6, 2024
8ac92d0
fixed bug in parsing bulk mutations errors
daniel-sanche Feb 6, 2024
7a95c4a
fixed up test
daniel-sanche Feb 6, 2024
c96b534
improved tests
daniel-sanche Feb 6, 2024
86159a4
added missing end_op
daniel-sanche Feb 7, 2024
89c5216
fixed test
daniel-sanche Feb 7, 2024
fcd3aaa
fixed lint
daniel-sanche Feb 7, 2024
f2528ae
combined wrapped predicate with wrapped exc factory
daniel-sanche Feb 7, 2024
eb3aae1
fixed blacken
daniel-sanche Feb 8, 2024
1a08f1a
fixed failing test
daniel-sanche Feb 8, 2024
fdc2e3b
improved exception parsing
daniel-sanche Feb 8, 2024
e428bf0
removed export interval option
daniel-sanche Feb 9, 2024
1bf566c
changed buckets
daniel-sanche Feb 9, 2024
ca0963a
Merge branch 'client_side_metrics_handlers' into client_side_metrics_…
daniel-sanche Feb 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
improved tests
  • Loading branch information
daniel-sanche committed Feb 6, 2024
commit c96b5347b5feba440fafb033f0a233d0a37dc388
63 changes: 37 additions & 26 deletions tests/system/data/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,10 @@ async def test_resource(get_all_metrics, instance_id, table_id, project_id):
assert resource.labels["instance"] == instance_id
assert resource.labels["table"] == table_id
assert resource.labels["project_id"] == project_id
if 'success' in m.metric.labels["app_profile"]:
# for attempts that succeeded, zone and cluster should be populated
assert resource.labels["zone"] == TEST_ZONE
assert resource.labels["cluster"] == TEST_CLUSTER
else:
# others should fall back to defaults
assert resource.labels["zone"] == "global"
assert resource.labels["cluster"] == "unspecified"
# zone and cluster are populated for successful attempts, and fall back to default values ('global' / 'unspecified') for failed attempts
assert resource.labels["zone"] in [TEST_ZONE, 'global']
assert resource.labels["cluster"] in [TEST_CLUSTER, 'unspecified']


@pytest.mark.asyncio
async def test_client_name(get_all_metrics):
Expand Down Expand Up @@ -378,7 +374,7 @@ async def test_status_exception(get_all_metrics):
"""
check the subset of rpcs with a single terminal exception

They should have no retries, 1 connectivity errors, a status of NOT_FOUND
They should have no retries, 1+ connectivity errors, a status of NOT_FOUND
Should have default values for cluster and zone
"""
fail_metrics = [m for m in get_all_metrics if m.metric.labels["app_profile"] == "terminal_exception"]
Expand All @@ -399,12 +395,16 @@ async def test_status_exception(get_all_metrics):
# check for cluster and zone
assert m.resource.labels["zone"] == 'global'
assert m.resource.labels["cluster"] == 'unspecified'
# ensure connectivity error count is 1
# each rpc should have at least one connectivity error
# ReadRows will have more, since we test point reads and streams
connectivity_error_counts = [m for m in fail_metrics if "connectivity_error_count" in m.metric.type]
assert len(connectivity_error_counts) > 0
for m in connectivity_error_counts:
for pt in m.points:
assert pt.value.int64_value == 1
for error_metric in connectivity_error_counts:
total_points = sum([int(pt.value.int64_value) for pt in error_metric.points])
assert total_points >= 1
# ensure each rpc reported connectivity errors
for prc in OperationType:
assert any(m.metric.labels["method"] == prc.value for m in connectivity_error_counts)


@pytest.mark.asyncio
@pytest.mark.parametrize("app_profile,final_status", [
Expand Down Expand Up @@ -435,22 +435,33 @@ async def test_status_retry(get_all_metrics, app_profile, final_status):
server_latencies = [m for m in retry_metrics if "server_latencies" in m.metric.type] # may not be present. only if reached server
first_response_latencies = [m for m in retry_metrics if "first_response_latencies" in m.metric.type] # may not be present

# each retry_count and connectivity_error_count should be 2
for m in retry_counts + connectivity_error_counts:
for pt in m.points:
assert pt.value.int64_value == 2
# should have at least 2 retry attempts
# ReadRows will have more, because it is called multiple times in the test data
for m in retry_counts:
total_errors = sum([int(pt.value.int64_value) for pt in m.points])
assert total_errors >= 2, f"{m} has {total_errors} errors"
# each rpc should have at least one connectivity error
# most will have 2, but there will be only 1 if status == NOT_FOUND
for m in connectivity_error_counts:
total_errors = sum([int(pt.value.int64_value) for pt in m.points])
assert total_errors >= 1, f"{m} has {total_errors} errors"

# all operation-level status should be final_status
for m in operation_latencies + retry_counts:
assert m.metric.labels["status"] == final_status
# all attempt-level status should have a 2:1 mix of final_status and UNAVAILABLE
status_map = {}
for m in attempt_latencies + server_latencies + first_response_latencies + connectivity_error_counts:
status_map[m.metric.labels["status"]] = status_map.get(m.metric.labels["status"], 0) + 1
assert len(status_map) == 2
assert final_status in status_map
assert "UNAVAILABLE" in status_map
assert len(status_map[final_status]) * 2 == len(status_map["UNAVAILABLE"])

# check attempt statuses
attempt_statuses = set([m.metric.labels["status"] for m in attempt_latencies + server_latencies + first_response_latencies + connectivity_error_counts])
if final_status == "DEADLINE_EXCEEDED":
# operation DEADLINE_EXCEEDED never shows up in attempts
assert len(attempt_statuses) == 1
assert "UNAVAILABLE" in attempt_statuses
else:
# all other attempt-level statuses should be a mix of final_status and UNAVAILABLE
assert len(attempt_statuses) == 2
assert "UNAVAILABLE" in attempt_statuses
assert final_status in attempt_statuses


@pytest.mark.asyncio
async def test_latency_metric_histogram_buckets(get_all_metrics):
Expand Down