Skip to content
Merged
175 changes: 175 additions & 0 deletions posthog/clickhouse/test/__snapshots__/test_raw_sessions_v3_model.ambr
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# serializer version: 1
# name: TestRawSessionsModel.test_ad_ids_map_and_set
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_autocapture_does_not_set_attribution_when_pageview_present
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_autocapture_does_set_attribution_when_only_event
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_channel_type_properties
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_counts_pageviews_autocaptures_and_events
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_handles_different_distinct_id_across_same_session
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_handles_entry_and_exit_urls
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_handles_initial_utm_properties
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_it_creates_session_when_creating_event
'''

select session_id_v7,
team_id
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_lookup_feature_flag
'''

select session_id_v7,
has(flag_values['$feature/flag_string'], 'f1_a') as has_f1_a,
has(flag_values['$feature/flag_string'], 'f1_b') as has_f1_b,
has(flag_values['$feature/flag_string'], 'f1_c') as has_f1_c
from raw_sessions_v3_v
where team_id = 99999
ORDER BY session_id_v7
'''
# ---
# name: TestRawSessionsModel.test_max_inserted_at
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_select_from_sessions
'''

SELECT session_id_v7,
team_id,
min_timestamp,
max_timestamp,
urls
FROM raw_sessions_v3
WHERE session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_select_from_sessions_mv
'''

SELECT session_id_v7,
team_id,
min_timestamp,
max_timestamp,
urls
FROM raw_sessions_v3_mv
WHERE session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_separates_sessions_across_same_user
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_separates_sessions_across_same_user.1
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_separates_sessions_across_same_user.2
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_store_all_feature_flag_values
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
# name: TestRawSessionsModel.test_tracks_all_distinct_ids
'''

select *
from raw_sessions_v3_v
where session_id_v7 = toUInt128(toUUID('00000000-0000-0000-0000-000000000000'))
AND team_id = 99999
'''
# ---
68 changes: 47 additions & 21 deletions posthog/clickhouse/test/__snapshots__/test_schema.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -2625,7 +2625,16 @@
CREATE TABLE IF NOT EXISTS raw_sessions_v3
(
team_id Int64,
session_id_v7 UUID,

-- Both UInt128 and UUID are imperfect choices here
-- see https://michcioperz.com/wiki/clickhouse-uuid-ordering/
-- but also see https://github.com/ClickHouse/ClickHouse/issues/77226 and hope
-- For now we choose UInt128 because that is the type of events.$session_id_uuid; in the future we will probably want to switch everything to the new ClickHouse UUID type (once it is released)
session_id_v7 UInt128,
-- Ideally we would not need to store this separately, as the timestamp is already encoded in the ID (its top 48 bits)
-- Unfortunately, for now, chaining ClickHouse functions to extract the timestamp breaks indexes / partition pruning, so we materialize it as a workaround
-- Again, once the new ClickHouse UUID type is released, we should try to switch to it and remove the separate timestamp column
session_timestamp DateTime64 MATERIALIZED fromUnixTimestamp64Milli(toUInt64(bitShiftRight(session_id_v7, 80))),

-- ClickHouse will pick the latest value of distinct_id for the session
-- this is fine since even if the distinct_id changes during a session
Expand Down Expand Up @@ -2711,7 +2720,7 @@
WITH parsed_events AS (
SELECT
team_id,
`$session_id`,
`$session_id_uuid` AS session_id_v7,
distinct_id AS _distinct_id,
person_id,
timestamp,
Expand Down Expand Up @@ -2841,8 +2850,8 @@
)

SELECT
team_id,
toUUID(`$session_id`) as session_id_v7,
team_id,
session_id_v7,

initializeAggregation('argMaxState', _distinct_id, timestamp) as distinct_id,
initializeAggregation('argMaxState', person_id, timestamp) as person_id,
Expand Down Expand Up @@ -3675,7 +3684,16 @@
CREATE TABLE IF NOT EXISTS sharded_raw_sessions_v3
(
team_id Int64,
session_id_v7 UUID,

-- Both UInt128 and UUID are imperfect choices here
-- see https://michcioperz.com/wiki/clickhouse-uuid-ordering/
-- but also see https://github.com/ClickHouse/ClickHouse/issues/77226 and hope
-- For now we choose UInt128 because that is the type of events.$session_id_uuid; in the future we will probably want to switch everything to the new ClickHouse UUID type (once it is released)
session_id_v7 UInt128,
-- Ideally we would not need to store this separately, as the timestamp is already encoded in the ID (its top 48 bits)
-- Unfortunately, for now, chaining ClickHouse functions to extract the timestamp breaks indexes / partition pruning, so we materialize it as a workaround
-- Again, once the new ClickHouse UUID type is released, we should try to switch to it and remove the separate timestamp column
session_timestamp DateTime64 MATERIALIZED fromUnixTimestamp64Milli(toUInt64(bitShiftRight(session_id_v7, 80))),

-- ClickHouse will pick the latest value of distinct_id for the session
-- this is fine since even if the distinct_id changes during a session
Expand Down Expand Up @@ -3749,15 +3767,10 @@
flag_values AggregateFunction(groupUniqArrayMap, Map(String, String))
) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.raw_sessions_v3', '{replica}')

PARTITION BY toYYYYMM(UUIDv7ToDateTime(session_id_v7))
PARTITION BY toYYYYMM(session_timestamp)
ORDER BY (
team_id,

-- sadly we need to include this as clickhouse UUIDs have insane ordering
-- see https://michcioperz.com/wiki/clickhouse-uuid-ordering/
-- but also see https://github.com/ClickHouse/ClickHouse/issues/77226 and hope
UUIDv7ToDateTime(session_id_v7),

session_timestamp,
session_id_v7
)

Expand Down Expand Up @@ -4661,7 +4674,16 @@
CREATE TABLE IF NOT EXISTS writable_raw_sessions_v3
(
team_id Int64,
session_id_v7 UUID,

-- Both UInt128 and UUID are imperfect choices here
-- see https://michcioperz.com/wiki/clickhouse-uuid-ordering/
-- but also see https://github.com/ClickHouse/ClickHouse/issues/77226 and hope
-- For now we choose UInt128 because that is the type of events.$session_id_uuid; in the future we will probably want to switch everything to the new ClickHouse UUID type (once it is released)
session_id_v7 UInt128,
-- Ideally we would not need to store this separately, as the timestamp is already encoded in the ID (its top 48 bits)
-- Unfortunately, for now, chaining ClickHouse functions to extract the timestamp breaks indexes / partition pruning, so we materialize it as a workaround
-- Again, once the new ClickHouse UUID type is released, we should try to switch to it and remove the separate timestamp column
session_timestamp DateTime64 MATERIALIZED fromUnixTimestamp64Milli(toUInt64(bitShiftRight(session_id_v7, 80))),

-- ClickHouse will pick the latest value of distinct_id for the session
-- this is fine since even if the distinct_id changes during a session
Expand Down Expand Up @@ -5926,7 +5948,16 @@
CREATE TABLE IF NOT EXISTS sharded_raw_sessions_v3
(
team_id Int64,
session_id_v7 UUID,

-- Both UInt128 and UUID are imperfect choices here
-- see https://michcioperz.com/wiki/clickhouse-uuid-ordering/
-- but also see https://github.com/ClickHouse/ClickHouse/issues/77226 and hope
-- For now we choose UInt128 because that is the type of events.$session_id_uuid; in the future we will probably want to switch everything to the new ClickHouse UUID type (once it is released)
session_id_v7 UInt128,
-- Ideally we would not need to store this separately, as the timestamp is already encoded in the ID (its top 48 bits)
-- Unfortunately, for now, chaining ClickHouse functions to extract the timestamp breaks indexes / partition pruning, so we materialize it as a workaround
-- Again, once the new ClickHouse UUID type is released, we should try to switch to it and remove the separate timestamp column
session_timestamp DateTime64 MATERIALIZED fromUnixTimestamp64Milli(toUInt64(bitShiftRight(session_id_v7, 80))),

-- ClickHouse will pick the latest value of distinct_id for the session
-- this is fine since even if the distinct_id changes during a session
Expand Down Expand Up @@ -6000,15 +6031,10 @@
flag_values AggregateFunction(groupUniqArrayMap, Map(String, String))
) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.raw_sessions_v3', '{replica}')

PARTITION BY toYYYYMM(UUIDv7ToDateTime(session_id_v7))
PARTITION BY toYYYYMM(session_timestamp)
ORDER BY (
team_id,

-- sadly we need to include this as clickhouse UUIDs have insane ordering
-- see https://michcioperz.com/wiki/clickhouse-uuid-ordering/
-- but also see https://github.com/ClickHouse/ClickHouse/issues/77226 and hope
UUIDv7ToDateTime(session_id_v7),

session_timestamp,
session_id_v7
)

Expand Down
19 changes: 14 additions & 5 deletions posthog/clickhouse/test/test_raw_sessions_v3_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import datetime

from posthog.test.base import BaseTest, ClickhouseTestMixin, _create_event, flush_persons_and_events
from posthog.test.base import (
BaseTest,
ClickhouseTestMixin,
_create_event,
flush_persons_and_events,
snapshot_clickhouse_queries,
)

from posthog.clickhouse.client import query_with_columns, sync_execute
from posthog.models.raw_sessions.sql_v3 import RAW_SESSION_TABLE_BACKFILL_SQL_V3
Expand All @@ -22,7 +28,10 @@ def create_session_id():
return str(uuid7(random=session_id_counter))


@snapshot_clickhouse_queries
class TestRawSessionsModel(ClickhouseTestMixin, BaseTest):
snapshot_replace_all_numbers = True

def select_by_session_id(self, session_id):
flush_persons_and_events()
return query_with_columns(
Expand All @@ -31,7 +40,7 @@ def select_by_session_id(self, session_id):
*
from raw_sessions_v3_v
where
session_id_v7 = toUUID(%(session_id)s) AND
session_id_v7 = toUInt128(toUUID(%(session_id)s)) AND
team_id = %(team_id)s
""",
{
Expand All @@ -58,7 +67,7 @@ def test_it_creates_session_when_creating_event(self):
team_id
from raw_sessions_v3_v
where
session_id_v7 = toUUID(%(session_id)s) AND
session_id_v7 = toUInt128(toUUID(%(session_id)s)) AND
team_id = %(team_id)s
""",
{
Expand Down Expand Up @@ -252,7 +261,7 @@ def test_select_from_sessions(self):
max_timestamp,
urls
FROM raw_sessions_v3
WHERE session_id_v7 = toUUID(%(session_id)s) AND team_id = %(team_id)s
WHERE session_id_v7 = toUInt128(toUUID(%(session_id)s)) AND team_id = %(team_id)s
""",
{
"session_id": session_id,
Expand Down Expand Up @@ -283,7 +292,7 @@ def test_select_from_sessions_mv(self):
max_timestamp,
urls
FROM raw_sessions_v3_mv
WHERE session_id_v7 = toUUID(%(session_id)s) AND team_id = %(team_id)s
WHERE session_id_v7 = toUInt128(toUUID(%(session_id)s)) AND team_id = %(team_id)s
""",
{
"session_id": session_id,
Expand Down
7 changes: 2 additions & 5 deletions posthog/hogql/database/schema/session_replay_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,7 @@
select_from_sessions_table_v2,
session_id_to_session_id_v7_as_uint128_expr,
)
from posthog.hogql.database.schema.sessions_v3 import (
select_from_sessions_table_v3,
session_id_to_session_id_v7_as_uuid_expr,
)
from posthog.hogql.database.schema.sessions_v3 import select_from_sessions_table_v3, session_id_to_uint128_as_uuid_expr
from posthog.hogql.errors import ResolutionError


Expand Down Expand Up @@ -91,7 +88,7 @@ def join_replay_table_to_sessions_table_v3(
join_expr.constraint = ast.JoinConstraint(
expr=ast.CompareOperation(
op=ast.CompareOperationOp.Eq,
left=session_id_to_session_id_v7_as_uuid_expr(ast.Field(chain=[join_to_add.from_table, "session_id"])),
left=session_id_to_uint128_as_uuid_expr(ast.Field(chain=[join_to_add.from_table, "session_id"])),
right=ast.Field(chain=[join_to_add.to_table, "session_id_v7"]),
),
constraint_type="ON",
Expand Down
Loading
Loading