Skip to content

Commit db448d7

Browse files
authored
Stats: Rebuild session smearing for timeseries (#5694)
* Refactor table_decider#partition_metrics
* Refactor query pipeline to return a list of subqueries after splitting
* Move order_by out of join logic
* Refactor joining logic in query_builder
  1. JOIN type is now set in QueryOptimizer
  2. JOIN logic is now table and list-size agnostic
* Comment an edge case
* Rebuild session/visit smearing
  Previously, whenever graphing any visit metric hourly/realtime, visit_duration and other visit metrics would be way higher than expected, due to long sessions dragging each bucket up and up.
  Now visits/visitors metrics are still smeared and other visit metrics are counted under the last bucket the user was active in.
  visits metric was also overcounted (see new tests).
* Remove unneeded case
* Unit test for smearing in tabledecider
1 parent bf24ae0 commit db448d7

9 files changed

Lines changed: 336 additions & 168 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ All notable changes to this project will be documented in this file.
2727
- Make clicking Compare / Disable Comparison in period picker menu close the menu
2828
- Do not log page views for hidden pages (prerendered pages and new tabs), until pages are viewed
2929
- Password-authenticated shared links now carry over dashboard params properly
30+
- Realtime and hourly graphs of visit duration and views per visit no longer overcount due to long-lasting sessions, instead showing each visit
31+
when it occurred.
32+
- Fixed realtime and hourly graphs of visits overcounting
3033

3134
## v3.0.0 - 2025-04-11
3235

lib/plausible/stats/query.ex

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ defmodule Plausible.Stats.Query do
2828
site_id: nil,
2929
site_native_stats_start_at: nil,
3030
# Contains information to determine how to combine legacy and new time on page metrics
31-
time_on_page_data: %{}
31+
time_on_page_data: %{},
32+
sql_join_type: :left,
33+
smear_session_metrics: false
3234

3335
require OpenTelemetry.Tracer, as: Tracer
3436
alias Plausible.Stats.{DateTimeRange, Filters, Imported, Legacy, Comparisons}

lib/plausible/stats/query_optimizer.ex

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ defmodule Plausible.Stats.QueryOptimizer do
2626
4. Updates event:hostname filters to also apply on visit level for sane results.
2727
5. Removes revenue metrics from dashboard queries if not requested, present or unavailable for the site.
2828
6. Trims the date range to the current time if query.include.trim_relative_date_range is true.
29+
7. Sets query.sql_join_type to :full when the query has a "time:minute" dimension; otherwise the existing join type is kept.
2930
3031
"""
3132
def optimize(query) do
@@ -40,18 +41,12 @@ defmodule Plausible.Stats.QueryOptimizer do
4041
for sessions.
4142
"""
4243
def split(query) do
43-
{event_metrics, sessions_metrics, _other_metrics} =
44-
query.metrics
45-
|> Util.maybe_add_visitors_metric()
46-
|> TableDecider.partition_metrics(query)
47-
48-
{
49-
Query.set(query,
50-
metrics: event_metrics,
51-
include_imported: query.include_imported
52-
),
53-
split_sessions_query(query, sessions_metrics)
54-
}
44+
query.metrics
45+
|> Util.maybe_add_visitors_metric()
46+
|> TableDecider.partition_metrics(query)
47+
|> Enum.map(fn {table_type, metrics} ->
48+
build_split_query(table_type, metrics, query)
49+
end)
5550
end
5651

5752
defp pipeline() do
@@ -62,7 +57,8 @@ defmodule Plausible.Stats.QueryOptimizer do
6257
&extend_hostname_filters_to_visit/1,
6358
&remove_revenue_metrics_if_unavailable/1,
6459
&set_time_on_page_data/1,
65-
&trim_relative_date_range/1
60+
&trim_relative_date_range/1,
61+
&set_sql_join_type/1
6662
]
6763
end
6864

@@ -162,7 +158,17 @@ defmodule Plausible.Stats.QueryOptimizer do
162158
Enum.find(query.dimensions, &Time.time_dimension?/1)
163159
end
164160

165-
defp split_sessions_query(query, session_metrics) do
161+
defp build_split_query(:events, metrics, query) do
162+
{
163+
:events,
164+
Query.set(query,
165+
metrics: metrics,
166+
include_imported: query.include_imported
167+
)
168+
}
169+
end
170+
171+
defp build_split_query(:sessions, metrics, query) do
166172
dimensions =
167173
query.dimensions
168174
|> Enum.map(fn
@@ -179,12 +185,21 @@ defmodule Plausible.Stats.QueryOptimizer do
179185
query.filters
180186
end
181187

182-
Query.set(query,
183-
filters: filters,
184-
metrics: session_metrics,
185-
dimensions: dimensions,
186-
include_imported: query.include_imported
187-
)
188+
{
189+
:sessions,
190+
Query.set(query,
191+
filters: filters,
192+
metrics: metrics,
193+
dimensions: dimensions,
194+
include_imported: query.include_imported
195+
)
196+
}
197+
end
198+
199+
defp build_split_query(:sessions_smeared, metrics, query) do
200+
{_, query} = build_split_query(:sessions, metrics, query)
201+
202+
{:sessions, Query.set(query, smear_session_metrics: true)}
188203
end
189204

190205
on_ee do
@@ -299,4 +314,19 @@ defmodule Plausible.Stats.QueryOptimizer do
299314
|> DateTimeRange.to_timezone("Etc/UTC")
300315
end
301316
end
317+
318+
# Normally we can always LEFT JOIN as this is more performant and tables
319+
# are expected to contain the same dimensions.
320+
321+
# The only exception is using the "time:minute" dimension where the sessions
322+
# subquery might return more rows than the events one. That's because we're
323+
# counting sessions in all time buckets they were active in even if no event
324+
# occurred during that particular minute.
325+
defp set_sql_join_type(query) do
326+
if "time:minute" in query.dimensions do
327+
Query.set(query, sql_join_type: :full)
328+
else
329+
query
330+
end
331+
end
302332
end

lib/plausible/stats/sql/expression.ex

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ defmodule Plausible.Stats.SQL.Expression do
6464
})
6565
end
6666

67-
def select_dimension(q, key, "time:hour", :sessions, query) do
67+
def select_dimension(q, key, "time:hour", :sessions, query) when query.smear_session_metrics do
6868
# :TRICKY: ClickHouse timeSlots works off of unix epoch and is not
6969
# timezone-aware. This means that for e.g. Asia/Katmandu (GMT+5:45)
7070
# to work, we divide time into 15-minute buckets and later combine these
@@ -87,7 +87,8 @@ defmodule Plausible.Stats.SQL.Expression do
8787
end
8888

8989
# :NOTE: This is not exposed in Query APIv2
90-
def select_dimension(q, key, "time:minute", :sessions, query) do
90+
def select_dimension(q, key, "time:minute", :sessions, query)
91+
when query.smear_session_metrics do
9192
q
9293
|> join(:inner, [s], time_slot in time_slots(query, 60),
9394
as: :time_slot,
@@ -338,6 +339,12 @@ defmodule Plausible.Stats.SQL.Expression do
338339
})
339340
end
340341

342+
def session_metric(:visits, query) when query.smear_session_metrics do
343+
wrap_alias([s], %{
344+
visits: scale_sample(fragment("uniq(?)", s.session_id))
345+
})
346+
end
347+
341348
def session_metric(:visits, _query) do
342349
wrap_alias([s], %{
343350
visits: scale_sample(fragment("greatest(sum(?), 0)", s.sign))

lib/plausible/stats/sql/query_builder.ex

Lines changed: 70 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,14 @@ defmodule Plausible.Stats.SQL.QueryBuilder do
1515
require Plausible.Stats.SQL.Expression
1616

1717
def build(query, site) do
18-
{event_query, sessions_query} = QueryOptimizer.split(query)
19-
20-
event_q = build_events_query(site, event_query)
21-
sessions_q = build_sessions_query(site, sessions_query)
22-
23-
join_query_results(
24-
{event_q, event_query},
25-
{sessions_q, sessions_query}
26-
)
18+
query
19+
|> QueryOptimizer.split()
20+
|> Enum.map(fn {table_type, table_query} ->
21+
q = build_table_query(table_type, site, table_query)
22+
{table_type, table_query, q}
23+
end)
24+
|> join_query_results(query)
25+
|> build_order_by(query)
2726
|> paginate(query.pagination)
2827
|> select_total_rows(query.include.total_rows)
2928
end
@@ -32,9 +31,7 @@ defmodule Plausible.Stats.SQL.QueryBuilder do
3231
Enum.reduce(query.order_by || [], q, &build_order_by(&2, query, &1))
3332
end
3433

35-
defp build_events_query(_site, %Query{metrics: []}), do: nil
36-
37-
defp build_events_query(site, events_query) do
34+
defp build_table_query(:events, site, events_query) do
3835
q =
3936
from(
4037
e in "events_v2",
@@ -54,6 +51,25 @@ defmodule Plausible.Stats.SQL.QueryBuilder do
5451
|> TimeOnPage.merge_legacy_time_on_page(events_query)
5552
end
5653

54+
defp build_table_query(:sessions, site, sessions_query) do
55+
q =
56+
from(
57+
e in "sessions_v2",
58+
where: ^SQL.WhereBuilder.build(:sessions, sessions_query),
59+
select: ^select_session_metrics(sessions_query)
60+
)
61+
62+
on_ee do
63+
q = Plausible.Stats.Sampling.add_query_hint(q, sessions_query)
64+
end
65+
66+
q
67+
|> join_events_if_needed(sessions_query)
68+
|> build_group_by(:sessions, sessions_query)
69+
|> merge_imported(site, sessions_query)
70+
|> SQL.SpecialMetrics.add(site, sessions_query)
71+
end
72+
5773
defp join_sessions_if_needed(q, query) do
5874
if TableDecider.events_join_sessions?(query) do
5975
sessions_q =
@@ -79,27 +95,6 @@ defmodule Plausible.Stats.SQL.QueryBuilder do
7995
end
8096
end
8197

82-
defp build_sessions_query(_site, %Query{metrics: []}), do: nil
83-
84-
defp build_sessions_query(site, sessions_query) do
85-
q =
86-
from(
87-
e in "sessions_v2",
88-
where: ^SQL.WhereBuilder.build(:sessions, sessions_query),
89-
select: ^select_session_metrics(sessions_query)
90-
)
91-
92-
on_ee do
93-
q = Plausible.Stats.Sampling.add_query_hint(q, sessions_query)
94-
end
95-
96-
q
97-
|> join_events_if_needed(sessions_query)
98-
|> build_group_by(:sessions, sessions_query)
99-
|> merge_imported(site, sessions_query)
100-
|> SQL.SpecialMetrics.add(site, sessions_query)
101-
end
102-
10398
def join_events_if_needed(q, query) do
10499
if TableDecider.sessions_join_events?(query) do
105100
events_q =
@@ -173,24 +168,24 @@ defmodule Plausible.Stats.SQL.QueryBuilder do
173168
)
174169
end
175170

176-
defp join_query_results({nil, _}, {nil, _}), do: nil
177-
178-
defp join_query_results({events_q, events_query}, {nil, _}),
179-
do: events_q |> build_order_by(events_query)
171+
# Only one table is being queried - skip joining!
172+
defp join_query_results([{_table_type, _query, q}], _main_query), do: q
180173

181-
defp join_query_results({nil, events_query}, {sessions_q, _}),
182-
do: sessions_q |> build_order_by(events_query)
174+
# Multiple tables: join results based on dimensions, select metrics from each and the appropriate dimensions.
175+
defp join_query_results(queries, main_query) do
176+
queries
177+
|> Enum.reduce(nil, fn
178+
{_table_type, query, q}, nil ->
179+
from(e in subquery(q))
180+
|> select_join_metrics(query, query.metrics)
183181

184-
defp join_query_results({events_q, events_query}, {sessions_q, sessions_query}) do
185-
{join_type, events_q_fields, sessions_q_fields} =
186-
TableDecider.join_options(events_query, sessions_query)
187-
188-
join(subquery(events_q), join_type, [e], s in subquery(sessions_q),
189-
on: ^build_group_by_join(events_query)
190-
)
191-
|> select_join_fields(events_query, events_q_fields, e)
192-
|> select_join_fields(sessions_query, sessions_q_fields, s)
193-
|> build_order_by(events_query)
182+
{_table_type, query, q}, acc ->
183+
join(acc, main_query.sql_join_type, [], s in subquery(q),
184+
on: ^build_group_by_join(main_query)
185+
)
186+
|> select_join_metrics(query, query.metrics -- [:sample_percent])
187+
end)
188+
|> select_dimensions(main_query)
194189
end
195190

196191
# NOTE: Old queries do their own pagination
@@ -214,8 +209,33 @@ defmodule Plausible.Stats.SQL.QueryBuilder do
214209
def build_group_by_join(query) do
215210
query.dimensions
216211
|> Enum.map(fn dim ->
217-
dynamic([e, s], field(e, ^shortname(query, dim)) == field(s, ^shortname(query, dim)))
212+
dynamic([a, ..., b], field(a, ^shortname(query, dim)) == field(b, ^shortname(query, dim)))
218213
end)
219214
|> Enum.reduce(fn condition, acc -> dynamic([], ^acc and ^condition) end)
220215
end
216+
217+
defp select_join_metrics(q, query, metrics) do
218+
Enum.reduce(metrics, q, fn
219+
metric, q ->
220+
select_merge_as(q, [..., x], %{
221+
shortname(query, metric) => field(x, ^shortname(query, metric))
222+
})
223+
end)
224+
end
225+
226+
defp select_dimensions(q, query) do
227+
Enum.reduce(query.dimensions, q, fn dimension, q ->
228+
# We generally select dimensions from the left-most table. The only exception is time:minute, where
229+
# we use the sessions table, as sessions are considered ongoing during the whole period.
230+
if query.sql_join_type == :full and "time:minute" == dimension do
231+
select_merge_as(q, [..., x], %{
232+
shortname(query, dimension) => field(x, ^shortname(query, dimension))
233+
})
234+
else
235+
select_merge_as(q, [x], %{
236+
shortname(query, dimension) => field(x, ^shortname(query, dimension))
237+
})
238+
end
239+
end)
240+
end
221241
end

0 commit comments

Comments
 (0)