# Checks for select_all_in_group: grouping by media_path or batch_id,
# de-duplication against an existing selection, and media-path exclusion.

# Fixture: j1 and j2 share a.wav and batch b1; j3 is on b.wav in batch b2.
transcriptions = [
    {
        "record_id": "j1",
        "provider_id": "p1",
        "media_path": "a.wav",
        "metadata": '{"batch_id": "b1"}',
    },
    {
        "record_id": "j2",
        "provider_id": "p1",
        "media_path": "a.wav",
        "metadata": '{"batch_id": "b1"}',
    },
    {
        "record_id": "j3",
        "provider_id": "p2",
        "media_path": "b.wav",
        "metadata": '{"batch_id": "b2"}',
    },
]

# Case 1: group by media_path with an empty existing selection.
# Both a.wav records are expected, in fixture order.
result = select_all_in_group(transcriptions, "a.wav", "media_path", [])
assert len(result) == 2
assert (result[0]["record_id"], result[1]["record_id"]) == ("j1", "j2")

# Case 2: group by batch_id; the two b1 records are expected.
result = select_all_in_group(transcriptions, "b1", "batch_id", [])
assert len(result) == 2

# Case 3: j1/p1 is already in the existing selection, so it must not be
# added a second time; the result still holds exactly j1 then j2.
result = select_all_in_group(
    transcriptions,
    "a.wav",
    "media_path",
    [{"record_id": "j1", "provider_id": "p1"}],
)
assert len(result) == 2
assert (result[0]["record_id"], result[1]["record_id"]) == ("j1", "j2")

# Case 4: the same record_id under a different provider_id is not treated
# as a duplicate.
result = select_all_in_group(
    transcriptions,
    "a.wav",
    "media_path",
    [{"record_id": "j1", "provider_id": "p_other"}],
)
assert len(result) == 3  # existing + j1/p1 + j2/p1

# Case 5: a group value that matches nothing yields an empty result.
result = select_all_in_group(transcriptions, "nonexistent.wav", "media_path", [])
assert len(result) == 0

# Case 6: excluded_media_paths skips records whose media_path is listed.
# Every a.wav match is excluded, so nothing is selected.
result = select_all_in_group(
    transcriptions,
    "a.wav",
    "media_path",
    [],
    excluded_media_paths={"a.wav"},
)
assert len(result) == 0  # All matching records share excluded media_path

# Case 7: excluded_media_paths combined with batch_id grouping, where the
# batch spans two different audio files.
mixed = [
    {
        "record_id": "j1",
        "provider_id": "p1",
        "media_path": "a.wav",
        "metadata": '{"batch_id": "b1"}',
    },
    {
        "record_id": "j2",
        "provider_id": "p1",
        "media_path": "b.wav",
        "metadata": '{"batch_id": "b1"}',
    },
]
result = select_all_in_group(
    mixed,
    "b1",
    "batch_id",
    [],
    excluded_media_paths={"a.wav"},
)
assert len(result) == 1
assert result[0]["record_id"] == "j2"  # Only b.wav source added

print("select_all_in_group tests passed")