Skip to content

Commit 33cee19

Browse files
fix: HogQL inefficient timestamp filter (PostHog#28912)
HogQL's toDateTime function becomes ClickHouse's toDateTime64 if the input is a literal in ISO format Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
1 parent a05856b commit 33cee19

30 files changed

+1420
-1345
lines changed

ee/clickhouse/views/test/__snapshots__/test_clickhouse_stickiness.ambr

Lines changed: 16 additions & 16 deletions
Large diffs are not rendered by default.

ee/clickhouse/views/test/__snapshots__/test_clickhouse_trends.ambr

Lines changed: 28 additions & 28 deletions
Large diffs are not rendered by default.

posthog/api/test/__snapshots__/test_insight.ambr

Lines changed: 16 additions & 16 deletions
Large diffs are not rendered by default.

posthog/hogql/database/schema/util/test/test_session_v2_where_clause_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -527,12 +527,12 @@ def test_session_breakdown(self):
527527
FROM
528528
raw_sessions
529529
WHERE
530-
and(equals(raw_sessions.team_id, <TEAM_ID>), ifNull(greaterOrEquals(plus(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(raw_sessions.session_id_v7, 80)), 1000)), toIntervalDay(3)), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_3)s, 6, %(hogql_val_4)s)))), 0), ifNull(lessOrEquals(minus(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(raw_sessions.session_id_v7, 80)), 1000)), toIntervalDay(3)), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_5)s, 6, %(hogql_val_6)s))), 0))
530+
and(equals(raw_sessions.team_id, <TEAM_ID>), ifNull(greaterOrEquals(plus(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(raw_sessions.session_id_v7, 80)), 1000)), toIntervalDay(3)), toStartOfDay(assumeNotNull(toDateTime64(%(hogql_val_3)s, 6, %(hogql_val_4)s)))), 0), ifNull(lessOrEquals(minus(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(raw_sessions.session_id_v7, 80)), 1000)), toIntervalDay(3)), assumeNotNull(toDateTime64(%(hogql_val_5)s, 6, %(hogql_val_6)s))), 0))
531531
GROUP BY
532532
raw_sessions.session_id_v7,
533533
raw_sessions.session_id_v7) AS e__session ON equals(toUInt128(accurateCastOrNull(e.`$session_id`, %(hogql_val_7)s)), e__session.session_id_v7)
534534
WHERE
535-
and(equals(e.team_id, <TEAM_ID>), and(greaterOrEquals(toTimeZone(e.timestamp, %(hogql_val_20)s), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_21)s, 6, %(hogql_val_22)s)))), lessOrEquals(toTimeZone(e.timestamp, %(hogql_val_23)s), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_24)s, 6, %(hogql_val_25)s))), equals(e.event, %(hogql_val_26)s), ifNull(in(if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id), (SELECT
535+
and(equals(e.team_id, <TEAM_ID>), and(greaterOrEquals(toTimeZone(e.timestamp, %(hogql_val_20)s), toStartOfDay(assumeNotNull(toDateTime64(%(hogql_val_21)s, 6, %(hogql_val_22)s)))), lessOrEquals(toTimeZone(e.timestamp, %(hogql_val_23)s), assumeNotNull(toDateTime64(%(hogql_val_24)s, 6, %(hogql_val_25)s))), equals(e.event, %(hogql_val_26)s), ifNull(in(if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id), (SELECT
536536
cohortpeople.person_id AS person_id
537537
FROM
538538
cohortpeople

posthog/hogql/database/schema/util/test/test_session_where_clause_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -495,12 +495,12 @@ def test_session_breakdown(self):
495495
FROM
496496
sessions
497497
WHERE
498-
and(equals(sessions.team_id, <TEAM_ID>), greaterOrEquals(plus(toTimeZone(sessions.min_timestamp, %(hogql_val_3)s), toIntervalDay(3)), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_4)s, 6, %(hogql_val_5)s)))), lessOrEquals(minus(toTimeZone(sessions.min_timestamp, %(hogql_val_6)s), toIntervalDay(3)), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_7)s, 6, %(hogql_val_8)s))))
498+
and(equals(sessions.team_id, <TEAM_ID>), greaterOrEquals(plus(toTimeZone(sessions.min_timestamp, %(hogql_val_3)s), toIntervalDay(3)), toStartOfDay(assumeNotNull(toDateTime64(%(hogql_val_4)s, 6, %(hogql_val_5)s)))), lessOrEquals(minus(toTimeZone(sessions.min_timestamp, %(hogql_val_6)s), toIntervalDay(3)), assumeNotNull(toDateTime64(%(hogql_val_7)s, 6, %(hogql_val_8)s))))
499499
GROUP BY
500500
sessions.session_id,
501501
sessions.session_id) AS e__session ON equals(e.`$session_id`, e__session.session_id)
502502
WHERE
503-
and(equals(e.team_id, <TEAM_ID>), and(greaterOrEquals(toTimeZone(e.timestamp, %(hogql_val_21)s), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_22)s, 6, %(hogql_val_23)s)))), lessOrEquals(toTimeZone(e.timestamp, %(hogql_val_24)s), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_25)s, 6, %(hogql_val_26)s))), equals(e.event, %(hogql_val_27)s), ifNull(in(if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id), (SELECT
503+
and(equals(e.team_id, <TEAM_ID>), and(greaterOrEquals(toTimeZone(e.timestamp, %(hogql_val_21)s), toStartOfDay(assumeNotNull(toDateTime64(%(hogql_val_22)s, 6, %(hogql_val_23)s)))), lessOrEquals(toTimeZone(e.timestamp, %(hogql_val_24)s), assumeNotNull(toDateTime64(%(hogql_val_25)s, 6, %(hogql_val_26)s))), equals(e.event, %(hogql_val_27)s), ifNull(in(if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id), (SELECT
504504
cohortpeople.person_id AS person_id
505505
FROM
506506
cohortpeople

posthog/hogql/database/schema/util/where_clause_extractor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,12 +403,13 @@ def visit_arithmetic_operation(self, node: ast.ArithmeticOperation) -> bool:
403403

404404
def visit_call(self, node: ast.Call) -> bool:
405405
# some functions just return a constant
406-
if node.name in ["today", "now"]:
406+
if node.name in ["today", "now", "now64", "yesterday"]:
407407
return True
408408
# some functions return a constant if the first argument is a constant
409409
if node.name in [
410410
"parseDateTime64BestEffortOrNull",
411411
"toDateTime",
412+
"toDateTime64",
412413
"toTimeZone",
413414
"assumeNotNull",
414415
"toIntervalYear",

posthog/hogql/functions/mapping.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -408,13 +408,16 @@ def compare_types(arg_types: list[ConstantType], sig_arg_types: tuple[ConstantTy
408408
"toDateTime": HogQLFunctionMeta(
409409
"parseDateTime64BestEffortOrNull",
410410
1,
411-
2,
411+
2, # Incorrect for parseDateTime64BestEffortOrNull but it is required because when we overload to toDateTime, we use this to figure out if timestamp is already in a function.
412412
tz_aware=True,
413-
overloads=[((ast.DateTimeType, ast.DateType, ast.IntegerType), "toDateTime")],
413+
overloads=[
414+
((ast.DateTimeType, ast.DateType, ast.IntegerType), "toDateTime"),
415+
# ((ast.StringType,), "parseDateTime64"), # missing in version: 24.8.7.41
416+
],
414417
signatures=[
415-
((StringType(),), DateTimeType(nullable=True)),
416-
((StringType(), IntegerType()), DateTimeType(nullable=True)),
417-
((StringType(), IntegerType(), StringType()), DateTimeType(nullable=True)),
418+
((StringType(),), DateTimeType()),
419+
((StringType(), IntegerType()), DateTimeType()),
420+
((StringType(), IntegerType(), StringType()), DateTimeType()),
418421
],
419422
),
420423
"toUUID": HogQLFunctionMeta("accurateCastOrNull", 1, 1, suffix_args=[ast.Constant(value="UUID")]),

posthog/hogql/printer.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22
from collections.abc import Iterable
33
from dataclasses import dataclass
4+
from dateutil import parser
45
from datetime import date, datetime
56
from difflib import get_close_matches
67
from typing import Literal, Optional, Union, cast
@@ -13,6 +14,7 @@
1314
)
1415
from posthog.clickhouse.property_groups import property_groups
1516
from posthog.hogql import ast
17+
from posthog.hogql.ast import StringType, Constant
1618
from posthog.hogql.base import _T_AST, AST
1719
from posthog.hogql.constants import (
1820
MAX_SELECT_RETURNED_ROWS,
@@ -1137,11 +1139,25 @@ def visit_call(self, node: ast.Call):
11371139
if not has_tz_override:
11381140
args.append(self.visit(ast.Constant(value=self._get_timezone())))
11391141

1142+
# If the datetime is in correct format, use optimal toDateTime, it's more strict but faster
1143+
# and it allows CH to use index efficiently.
1144+
if (
1145+
relevant_clickhouse_name == "parseDateTime64BestEffortOrNull"
1146+
and len(node.args) == 1
1147+
and isinstance(node.args[0], Constant)
1148+
and isinstance(node.args[0].type, StringType)
1149+
):
1150+
try:
1151+
_ = parser.isoparse(node.args[0].value)
1152+
relevant_clickhouse_name = "toDateTime64"
1153+
except ValueError:
1154+
pass
1155+
11401156
if (
11411157
relevant_clickhouse_name == "now64"
11421158
and (len(node.args) == 0 or (has_tz_override and len(node.args) == 1))
11431159
) or (
1144-
relevant_clickhouse_name == "parseDateTime64BestEffortOrNull"
1160+
relevant_clickhouse_name in ("parseDateTime64BestEffortOrNull", "toDateTime64")
11451161
and (len(node.args) == 1 or (has_tz_override and len(node.args) == 2))
11461162
):
11471163
# These two CH functions require a precision argument before timezone

posthog/hogql/test/__snapshots__/test_query.ambr

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,30 @@
11
# serializer version: 1
2+
# name: TestQuery.test_clickhouse_timestamp_handling
3+
'''
4+
-- ClickHouse
5+
6+
SELECT if(not(empty(events__exception_issue_override.issue_id)), events__exception_issue_override.issue_id, accurateCastOrNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(events.properties, %(hogql_val_1)s), ''), 'null'), '^"|"$', ''), %(hogql_val_2)s)) AS id, count(DISTINCT events.uuid) AS occurrences, count(DISTINCT nullIf(events.`$session_id`, %(hogql_val_3)s)) AS sessions, count(DISTINCT events.distinct_id) AS users, max(toTimeZone(events.timestamp, %(hogql_val_4)s)) AS last_seen, min(toTimeZone(events.timestamp, %(hogql_val_5)s)) AS first_seen, reverse(arrayMap(x -> countEqual(groupArray(dateDiff(%(hogql_val_6)s, toStartOfHour(toTimeZone(events.timestamp, %(hogql_val_7)s)), toStartOfHour(now64(6, %(hogql_val_8)s)))), x), range(24))) AS volumeDay, reverse(arrayMap(x -> countEqual(groupArray(dateDiff(%(hogql_val_9)s, toStartOfDay(toTimeZone(events.timestamp, %(hogql_val_10)s)), toStartOfDay(now64(6, %(hogql_val_11)s)))), x), range(31))) AS volumeMonth, reverse(arrayMap(x -> countEqual(groupArray(dateDiff(%(hogql_val_12)s, toStartOfHour(toTimeZone(events.timestamp, %(hogql_val_13)s)), toStartOfHour(now64(6, %(hogql_val_14)s)))), x), range(168))) AS customVolume
7+
FROM events LEFT OUTER JOIN (
8+
SELECT argMax(error_tracking_issue_fingerprint_overrides.issue_id, error_tracking_issue_fingerprint_overrides.version) AS issue_id, error_tracking_issue_fingerprint_overrides.fingerprint AS fingerprint
9+
FROM error_tracking_issue_fingerprint_overrides
10+
WHERE equals(error_tracking_issue_fingerprint_overrides.team_id, 99999)
11+
GROUP BY error_tracking_issue_fingerprint_overrides.fingerprint
12+
HAVING ifNull(equals(argMax(error_tracking_issue_fingerprint_overrides.is_deleted, error_tracking_issue_fingerprint_overrides.version), 0), 0)
13+
SETTINGS optimize_aggregation_in_order=1) AS events__exception_issue_override ON equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(events.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), events__exception_issue_override.fingerprint)
14+
WHERE and(equals(events.team_id, 99999), and(equals(events.event, %(hogql_val_15)s), isNotNull(if(not(empty(events__exception_issue_override.issue_id)), events__exception_issue_override.issue_id, accurateCastOrNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(events.properties, %(hogql_val_16)s), ''), 'null'), '^"|"$', ''), %(hogql_val_17)s))), and(1, greaterOrEquals(toTimeZone(events.timestamp, %(hogql_val_18)s), toDateTime64(%(hogql_val_19)s, 6, %(hogql_val_20)s)))))
15+
GROUP BY if(not(empty(events__exception_issue_override.issue_id)), events__exception_issue_override.issue_id, accurateCastOrNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(events.properties, %(hogql_val_21)s), ''), 'null'), '^"|"$', ''), %(hogql_val_22)s)) ORDER BY occurrences DESC
16+
LIMIT 51 OFFSET 0
17+
SETTINGS readonly=2, max_execution_time=60, allow_experimental_object_type=1, format_csv_allow_double_quotes=0, max_ast_elements=4000000, max_expanded_ast_elements=4000000, max_bytes_before_external_group_by=0
18+
19+
-- HogQL
20+
21+
SELECT issue_id AS id, count(DISTINCT uuid) AS occurrences, count(DISTINCT nullIf($session_id, '')) AS sessions, count(DISTINCT distinct_id) AS users, max(timestamp) AS last_seen, min(timestamp) AS first_seen, reverse(arrayMap(x -> countEqual(groupArray(dateDiff('hour', toStartOfHour(timestamp), toStartOfHour(now()))), x), range(24))) AS volumeDay, reverse(arrayMap(x -> countEqual(groupArray(dateDiff('day', toStartOfDay(timestamp), toStartOfDay(now()))), x), range(31))) AS volumeMonth, reverse(arrayMap(x -> countEqual(groupArray(dateDiff('hour', toStartOfHour(timestamp), toStartOfHour(now()))), x), range(168))) AS customVolume
22+
FROM events
23+
WHERE and(equals(event, '$exception'), isNotNull(issue_id), and(1, greaterOrEquals(timestamp, toDateTime('2025-02-10 23:53:03.175952'))))
24+
GROUP BY issue_id ORDER BY occurrences DESC
25+
LIMIT 51 OFFSET 0
26+
'''
27+
# ---
228
# name: TestQuery.test_hogql_arrays
329
'''
430
-- ClickHouse

posthog/hogql/test/test_printer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,7 +1369,7 @@ def test_print_timezone(self):
13691369
"SELECT now(), toDateTime(timestamp), toDate(test_date), toDateTime('2020-02-02') FROM events",
13701370
context,
13711371
),
1372-
f"SELECT now64(6, %(hogql_val_0)s), toDateTime(toTimeZone(events.timestamp, %(hogql_val_1)s), %(hogql_val_2)s), toDate(events.test_date), parseDateTime64BestEffortOrNull(%(hogql_val_3)s, 6, %(hogql_val_4)s) FROM events WHERE equals(events.team_id, {self.team.pk}) LIMIT {MAX_SELECT_RETURNED_ROWS}",
1372+
f"SELECT now64(6, %(hogql_val_0)s), toDateTime(toTimeZone(events.timestamp, %(hogql_val_1)s), %(hogql_val_2)s), toDate(events.test_date), toDateTime64(%(hogql_val_3)s, 6, %(hogql_val_4)s) FROM events WHERE equals(events.team_id, {self.team.pk}) LIMIT {MAX_SELECT_RETURNED_ROWS}",
13731373
)
13741374
self.assertEqual(
13751375
context.values,
@@ -1391,7 +1391,7 @@ def test_print_timezone_custom(self):
13911391
"SELECT now(), toDateTime(timestamp), toDateTime('2020-02-02') FROM events",
13921392
context,
13931393
),
1394-
f"SELECT now64(6, %(hogql_val_0)s), toDateTime(toTimeZone(events.timestamp, %(hogql_val_1)s), %(hogql_val_2)s), parseDateTime64BestEffortOrNull(%(hogql_val_3)s, 6, %(hogql_val_4)s) FROM events WHERE equals(events.team_id, {self.team.pk}) LIMIT {MAX_SELECT_RETURNED_ROWS}",
1394+
f"SELECT now64(6, %(hogql_val_0)s), toDateTime(toTimeZone(events.timestamp, %(hogql_val_1)s), %(hogql_val_2)s), toDateTime64(%(hogql_val_3)s, 6, %(hogql_val_4)s) FROM events WHERE equals(events.team_id, {self.team.pk}) LIMIT {MAX_SELECT_RETURNED_ROWS}",
13951395
)
13961396
self.assertEqual(
13971397
context.values,
@@ -1478,7 +1478,7 @@ def test_to_start_of_week_gets_mode(self):
14781478
def test_functions_expecting_datetime_arg(self):
14791479
self.assertEqual(
14801480
self._expr("tumble(toDateTime('2023-06-12'), toIntervalDay('1'))"),
1481-
f"tumble(assumeNotNull(toDateTime(parseDateTime64BestEffortOrNull(%(hogql_val_0)s, 6, %(hogql_val_1)s))), toIntervalDay(%(hogql_val_2)s))",
1481+
f"tumble(assumeNotNull(toDateTime(toDateTime64(%(hogql_val_0)s, 6, %(hogql_val_1)s))), toIntervalDay(%(hogql_val_2)s))",
14821482
)
14831483
self.assertEqual(
14841484
self._expr("tumble(now(), toIntervalDay('1'))"),

0 commit comments

Comments
 (0)