Mirror of https://github.com/element-hq/synapse.git (synced 2025-12-15 02:00:21 +00:00)

Compare commits: quenting/t...erikj/purg (4 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 08df03d693 |  |
|  | 26b1e82fa8 |  |
|  | 53a74381de |  |
|  | 64485760fc |  |
changelog.d/18107.bugfix (new file, 1 line)

@@ -0,0 +1 @@
Fix rare edge case where state groups could be deleted while we are persisting new events that reference them.
synapse/handlers/federation_event.py

@@ -151,6 +151,8 @@ class FederationEventHandler:
     def __init__(self, hs: "HomeServer"):
         self._clock = hs.get_clock()
         self._store = hs.get_datastores().main
         self._state_store = hs.get_datastores().state
+        self._state_epoch_store = hs.get_datastores().state_epochs
         self._storage_controllers = hs.get_storage_controllers()
         self._state_storage_controller = self._storage_controllers.state

@@ -580,7 +582,9 @@ class FederationEventHandler:
                     room_version.identifier,
                     state_maps_to_resolve,
                     event_map=None,
-                    state_res_store=StateResolutionStore(self._store),
+                    state_res_store=StateResolutionStore(
+                        self._store, self._state_epoch_store
+                    ),
                 )
             )
         else:

@@ -1179,7 +1183,9 @@ class FederationEventHandler:
                 room_version,
                 state_maps,
                 event_map={event_id: event},
-                state_res_store=StateResolutionStore(self._store),
+                state_res_store=StateResolutionStore(
+                    self._store, self._state_epoch_store
+                ),
             )

         except Exception as e:

@@ -1874,7 +1880,9 @@ class FederationEventHandler:
                     room_version,
                     [local_state_id_map, claimed_auth_events_id_map],
                     event_map=None,
-                    state_res_store=StateResolutionStore(self._store),
+                    state_res_store=StateResolutionStore(
+                        self._store, self._state_epoch_store
+                    ),
                 )
             )
         else:

@@ -2014,7 +2022,9 @@ class FederationEventHandler:
                 room_version,
                 state_sets,
                 event_map=None,
-                state_res_store=StateResolutionStore(self._store),
+                state_res_store=StateResolutionStore(
+                    self._store, self._state_epoch_store
+                ),
             )
         )
     else:
synapse/state/__init__.py

@@ -59,11 +59,13 @@ from synapse.types.state import StateFilter
 from synapse.util.async_helpers import Linearizer
 from synapse.util.caches.expiringcache import ExpiringCache
 from synapse.util.metrics import Measure, measure_func
+from synapse.util.stringutils import shortstr

 if TYPE_CHECKING:
     from synapse.server import HomeServer
     from synapse.storage.controllers import StateStorageController
     from synapse.storage.databases.main import DataStore
+    from synapse.storage.databases.state.epochs import StateEpochDataStore

 logger = logging.getLogger(__name__)
 metrics_logger = logging.getLogger("synapse.state.metrics")

@@ -194,6 +196,8 @@ class StateHandler:
         self._storage_controllers = hs.get_storage_controllers()
         self._events_shard_config = hs.config.worker.events_shard_config
         self._instance_name = hs.get_instance_name()
+        self._state_store = hs.get_datastores().state
+        self._state_epoch_store = hs.get_datastores().state_epochs

         self._update_current_state_client = (
             ReplicationUpdateCurrentStateRestServlet.make_client(hs)

@@ -475,7 +479,10 @@ class StateHandler:
     @trace
     @measure_func()
     async def resolve_state_groups_for_events(
-        self, room_id: str, event_ids: StrCollection, await_full_state: bool = True
+        self,
+        room_id: str,
+        event_ids: StrCollection,
+        await_full_state: bool = True,
     ) -> _StateCacheEntry:
         """Given a list of event_ids this method fetches the state at each
         event, resolves conflicts between them and returns them.

@@ -511,6 +518,19 @@ class StateHandler:
             ) = await self._state_storage_controller.get_state_group_delta(
                 state_group_id
             )

+            if prev_group:
+                # Ensure that we still have the prev group, and ensure we don't
+                # delete it while we're persisting the event.
+                missing_state_group = (
+                    await self._state_epoch_store.check_state_groups_and_bump_deletion(
+                        {prev_group}
+                    )
+                )
+                if missing_state_group:
+                    prev_group = None
+                    delta_ids = None
+
             return _StateCacheEntry(
                 state=None,
                 state_group=state_group_id,

@@ -531,7 +551,7 @@ class StateHandler:
             room_version,
             state_to_resolve,
             None,
-            state_res_store=StateResolutionStore(self.store),
+            state_res_store=StateResolutionStore(self.store, self._state_epoch_store),
         )
         return result

@@ -663,7 +683,25 @@ class StateResolutionHandler:
         async with self.resolve_linearizer.queue(group_names):
             cache = self._state_cache.get(group_names, None)
             if cache:
-                return cache
+                # Check that the returned cache entry doesn't point to deleted
+                # state groups.
+                state_groups_to_check = set()
+                if cache.state_group is not None:
+                    state_groups_to_check.add(cache.state_group)
+
+                if cache.prev_group is not None:
+                    state_groups_to_check.add(cache.prev_group)
+
+                missing_state_groups = await state_res_store.state_epoch_store.check_state_groups_and_bump_deletion(
+                    state_groups_to_check
+                )
+
+                if not missing_state_groups:
+                    return cache
+                else:
+                    # There are missing state groups, so let's remove the stale
+                    # entry and continue as if it was a cache miss.
+                    self._state_cache.pop(group_names, None)

             logger.info(
                 "Resolving state for %s with groups %s",

@@ -671,6 +709,16 @@ class StateResolutionHandler:
                 list(group_names),
             )

+            # We double-check that none of the state groups have been deleted.
+            # They shouldn't have been, as all these state groups should be referenced.
+            missing_state_groups = await state_res_store.state_epoch_store.check_state_groups_and_bump_deletion(
+                group_names
+            )
+            if missing_state_groups:
+                raise Exception(
+                    f"State groups have been deleted: {shortstr(missing_state_groups)}"
+                )
+
             state_groups_histogram.observe(len(state_groups_ids))

             new_state = await self.resolve_events_with_store(

@@ -884,7 +932,8 @@ class StateResolutionStore:
     in well defined way.
     """

-    store: "DataStore"
+    main_store: "DataStore"
+    state_epoch_store: "StateEpochDataStore"

     def get_events(
         self, event_ids: StrCollection, allow_rejected: bool = False

@@ -899,7 +948,7 @@ class StateResolutionStore:
            An awaitable which resolves to a dict from event_id to event.
        """

-        return self.store.get_events(
+        return self.main_store.get_events(
            event_ids,
            redact_behaviour=EventRedactBehaviour.as_is,
            get_prev_content=False,

@@ -920,4 +969,4 @@ class StateResolutionStore:
            An awaitable that resolves to a set of event IDs.
        """

-        return self.store.get_auth_chain_difference(room_id, state_sets)
+        return self.main_store.get_auth_chain_difference(room_id, state_sets)
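Aside: the `resolve_state_groups` hunks above introduce a check-on-hit cache pattern: a cached entry is only returned after verifying that the state groups it references still exist (a check that also bumps their deletion epoch); otherwise the entry is evicted and resolution proceeds as a cache miss. A minimal, self-contained sketch of that pattern, with hypothetical names (`RevalidatingCache`, `still_missing`) rather than Synapse's actual classes:

```python
from typing import Callable, Dict, Hashable, Optional, Set, Tuple


class RevalidatingCache:
    """Cache whose entries are re-checked against an external validity test
    on every hit, so stale entries are evicted rather than returned."""

    def __init__(self, still_missing: Callable[[Set[int]], Set[int]]) -> None:
        # `still_missing` returns the subset of referenced IDs that no
        # longer exist (hypothetical stand-in for the async store call).
        self._still_missing = still_missing
        self._entries: Dict[Hashable, Tuple[int, ...]] = {}

    def get(self, key: Hashable) -> Optional[Tuple[int, ...]]:
        entry = self._entries.get(key)
        if entry is None:
            return None
        if self._still_missing(set(entry)):
            # The entry references deleted state groups: evict it and
            # behave exactly like a cache miss.
            self._entries.pop(key, None)
            return None
        return entry

    def put(self, key: Hashable, referenced_groups: Tuple[int, ...]) -> None:
        self._entries[key] = referenced_groups
```

In the real code the validity test is the async `check_state_groups_and_bump_deletion`, so a cache hit doubles as a deferral of any pending deletion.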
synapse/storage/controllers/persist_events.py

@@ -332,6 +332,7 @@ class EventsPersistenceStorageController:
         # store for now.
         self.main_store = stores.main
         self.state_store = stores.state
+        self._state_epoch_store = stores.state_epochs

         assert stores.persist_events
         self.persist_events_store = stores.persist_events

@@ -549,7 +550,9 @@ class EventsPersistenceStorageController:
             room_version,
             state_maps_by_state_group,
             event_map=None,
-            state_res_store=StateResolutionStore(self.main_store),
+            state_res_store=StateResolutionStore(
+                self.main_store, self._state_epoch_store
+            ),
         )

         return await res.get_state(self._state_controller, StateFilter.all())

@@ -635,15 +638,20 @@ class EventsPersistenceStorageController:
                 room_id, [e for e, _ in chunk]
             )

-            await self.persist_events_store._persist_events_and_state_updates(
-                room_id,
-                chunk,
-                state_delta_for_room=state_delta_for_room,
-                new_forward_extremities=new_forward_extremities,
-                use_negative_stream_ordering=backfilled,
-                inhibit_local_membership_updates=backfilled,
-                new_event_links=new_event_links,
-            )
+            # Stop the state groups from being deleted while we're persisting
+            # them.
+            async with self._state_epoch_store.persisting_state_group_references(
+                events_and_contexts
+            ):
+                await self.persist_events_store._persist_events_and_state_updates(
+                    room_id,
+                    chunk,
+                    state_delta_for_room=state_delta_for_room,
+                    new_forward_extremities=new_forward_extremities,
+                    use_negative_stream_ordering=backfilled,
+                    inhibit_local_membership_updates=backfilled,
+                    new_event_links=new_event_links,
+                )

         return replaced_events

@@ -965,7 +973,9 @@ class EventsPersistenceStorageController:
             room_version,
             state_groups,
             events_map,
-            state_res_store=StateResolutionStore(self.main_store),
+            state_res_store=StateResolutionStore(
+                self.main_store, self._state_epoch_store
+            ),
         )

         state_resolutions_during_persistence.inc()
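Aside: the hunk above wraps event persistence in `persisting_state_group_references`, an async context manager that marks the referenced state groups as in use before any rows are written and releases the marks afterwards, even on failure. A minimal sketch of that guard pattern, assuming hypothetical `mark_in_use`/`unmark` coroutines in place of the real store methods:

```python
import contextlib
from typing import AsyncIterator, Awaitable, Callable, Set


@contextlib.asynccontextmanager
async def guard_references(
    groups: Set[int],
    mark_in_use: Callable[[Set[int]], Awaitable[None]],
    unmark: Callable[[Set[int]], Awaitable[None]],
) -> AsyncIterator[None]:
    """Mark `groups` as in use for the duration of the wrapped block."""
    if not groups:
        # Nothing referenced, nothing to guard.
        yield
        return

    # Marking may raise if a group has already been deleted, aborting
    # persistence before any event rows are written.
    await mark_in_use(groups)
    try:
        yield
    finally:
        # Always release the marks, even if persistence failed.
        await unmark(groups)
```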
synapse/storage/controllers/purge_events.py

@@ -118,6 +118,16 @@ class PurgeEventsStorageController:
             next_to_search |= prevs
             state_groups_seen |= prevs

+            # We also check whether anything referencing these state groups is
+            # itself unreferenced. This helps ensure that we delete unreferenced
+            # state groups: if we don't, we will de-delta them when we delete
+            # the other state groups, leading to increased DB usage.
+            next_edges = await self.stores.state.get_next_state_groups(current_search)
+            nexts = set(next_edges.keys())
+            nexts -= state_groups_seen
+            next_to_search |= nexts
+            state_groups_seen |= nexts
+
         to_delete = state_groups_seen - referenced_groups

         return to_delete
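Aside: with the `get_next_state_groups` addition, the purge traversal walks the state-group graph in both directions, following `prev_state_group` edges backwards and the new "next" edges forwards, then deletes whatever the sweep reached that is not externally referenced. A rough sketch of that traversal over plain dicts (the names and in-memory edge maps are illustrative; the real code queries `state_group_edges` in batches):

```python
from typing import Dict, Set


def find_deletable(
    start: Set[int],
    prev_edges: Dict[int, int],       # state_group -> prev_state_group
    next_edges: Dict[int, Set[int]],  # prev_state_group -> state_groups
    referenced: Set[int],
) -> Set[int]:
    """Breadth-first sweep in both directions from `start`."""
    seen: Set[int] = set(start)
    frontier = set(start)
    while frontier:
        current, frontier = frontier, set()
        for group in current:
            # Walk backwards to prev groups and forwards to groups that
            # delta off anything we have seen so far.
            neighbours = set(next_edges.get(group, set()))
            if group in prev_edges:
                neighbours.add(prev_edges[group])
            frontier |= neighbours - seen
            seen |= neighbours
    # Only groups that nothing else references may actually be deleted.
    return seen - referenced
```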
synapse/storage/databases/__init__.py

@@ -26,6 +26,7 @@ from synapse.storage._base import SQLBaseStore
 from synapse.storage.database import DatabasePool, make_conn
 from synapse.storage.databases.main.events import PersistEventsStore
 from synapse.storage.databases.state import StateGroupDataStore
+from synapse.storage.databases.state.epochs import StateEpochDataStore
 from synapse.storage.engines import create_engine
 from synapse.storage.prepare_database import prepare_database

@@ -49,12 +50,14 @@ class Databases(Generic[DataStoreT]):
         main
         state
         persist_events
+        state_epochs
     """

     databases: List[DatabasePool]
     main: "DataStore"  # FIXME: https://github.com/matrix-org/synapse/issues/11165: actually an instance of `main_store_class`
     state: StateGroupDataStore
     persist_events: Optional[PersistEventsStore]
+    state_epochs: StateEpochDataStore

     def __init__(self, main_store_class: Type[DataStoreT], hs: "HomeServer"):
         # Note we pass in the main store class here as workers use a different main

@@ -63,6 +66,7 @@ class Databases(Generic[DataStoreT]):
         self.databases = []
         main: Optional[DataStoreT] = None
         state: Optional[StateGroupDataStore] = None
+        state_epochs: Optional[StateEpochDataStore] = None
         persist_events: Optional[PersistEventsStore] = None

         for database_config in hs.config.database.databases:

@@ -114,7 +118,8 @@ class Databases(Generic[DataStoreT]):
                 if state:
                     raise Exception("'state' data store already configured")

-                state = StateGroupDataStore(database, db_conn, hs)
+                state_epochs = StateEpochDataStore(database, db_conn, hs)
+                state = StateGroupDataStore(database, db_conn, hs, state_epochs)

                 db_conn.commit()

@@ -135,7 +140,7 @@ class Databases(Generic[DataStoreT]):
         if not main:
             raise Exception("No 'main' database configured")

-        if not state:
+        if not state or not state_epochs:
             raise Exception("No 'state' database configured")

         # We use local variables here to ensure that the databases do not have

@@ -143,3 +148,4 @@ class Databases(Generic[DataStoreT]):
         self.main = main  # type: ignore[assignment]
         self.state = state
         self.persist_events = persist_events
+        self.state_epochs = state_epochs
synapse/storage/databases/state/epochs.py (new file, 304 lines)

@@ -0,0 +1,304 @@
#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright (C) 2025 New Vector, Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#


import contextlib
from typing import (
    TYPE_CHECKING,
    AbstractSet,
    AsyncIterator,
    Collection,
    Set,
    Tuple,
)

from synapse.events import EventBase
from synapse.events.snapshot import EventContext
from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.storage.database import (
    DatabasePool,
    LoggingDatabaseConnection,
    LoggingTransaction,
    make_in_list_sql_clause,
)
from synapse.storage.engines import PostgresEngine
from synapse.util.stringutils import shortstr

if TYPE_CHECKING:
    from synapse.server import HomeServer


class StateEpochDataStore:
    """Manages state epochs and checks for state group deletion.

    Deleting state groups is challenging: before we actually delete them, we
    need to ensure that there are no in-flight events that refer to the state
    groups that we want to delete.

    To handle this, we take two approaches. First, before we persist any event
    we ensure that the state groups still exist and mark in the
    `state_groups_persisting` table that the state group is about to be used.
    (Note that we have to have the extra table here as state groups and events
    can be in different databases, and thus we can't check for the existence of
    state groups in the persist event transaction.) Once the event has been
    persisted, we can remove the row from `state_groups_persisting`. So long as
    we check that table before deleting state groups, we can ensure that we
    never persist events that reference deleted state groups, maintaining
    database integrity.

    However, we want to avoid throwing exceptions so deep in the process of
    persisting events. So we use a concept of `state_epochs`, where we mark
    state groups as pending/proposed for deletion and wait for a certain number
    of epoch increments before performing the deletion. When we come to handle
    new events that reference state groups, we check if they are pending
    deletion and bump the epoch in which they'll be deleted (to give the event
    a chance to be persisted, or not).
    """

    # How frequently, roughly, to increment epochs.
    TIME_BETWEEN_EPOCH_INCREMENTS_MS = 5 * 60 * 1000

    # The number of epoch increases that must have happened between marking a
    # state group as pending and actually deleting it.
    NUMBER_EPOCHS_BEFORE_DELETION = 3

    def __init__(
        self,
        database: DatabasePool,
        db_conn: LoggingDatabaseConnection,
        hs: "HomeServer",
    ):
        self._clock = hs.get_clock()
        self.db_pool = database
        self._instance_name = hs.get_instance_name()

        # TODO: Clear from `state_groups_persisting` any holdovers from a
        # previous running instance.

        if hs.config.worker.run_background_tasks:
            # Add a background loop to periodically check if we should bump
            # the state epoch.
            self._clock.looping_call_now(
                self._advance_state_epoch, self.TIME_BETWEEN_EPOCH_INCREMENTS_MS / 5
            )

    @wrap_as_background_process("_advance_state_epoch")
    async def _advance_state_epoch(self) -> None:
        """Advances the state epoch, checking that we haven't advanced it too
        recently.
        """

        now = self._clock.time_msec()
        update_if_before_ts = now - self.TIME_BETWEEN_EPOCH_INCREMENTS_MS

        def advance_state_epoch_txn(txn: LoggingTransaction) -> None:
            sql = """
                UPDATE state_epoch
                SET state_epoch = state_epoch + 1, updated_ts = ?
                WHERE updated_ts <= ?
            """
            txn.execute(sql, (now, update_if_before_ts))

        await self.db_pool.runInteraction(
            "_advance_state_epoch", advance_state_epoch_txn, db_autocommit=True
        )

    async def check_state_groups_and_bump_deletion(
        self, state_groups: AbstractSet[int]
    ) -> Collection[int]:
        """Checks to make sure that the state groups haven't been deleted, and
        if they're pending deletion we delay it (allowing time for any event
        that will use them to finish persisting).

        Returns:
            The state groups that are missing, if any.
        """

        return await self.db_pool.runInteraction(
            "check_state_groups_and_bump_deletion",
            self._check_state_groups_and_bump_deletion_txn,
            state_groups,
        )

    def _check_state_groups_and_bump_deletion_txn(
        self, txn: LoggingTransaction, state_groups: AbstractSet[int]
    ) -> Collection[int]:
        existing_state_groups = self._get_existing_groups_with_lock(txn, state_groups)
        if state_groups - existing_state_groups:
            return state_groups - existing_state_groups

        clause, args = make_in_list_sql_clause(
            self.db_pool.engine, "state_group", state_groups
        )
        sql = f"""
            UPDATE state_groups_pending_deletion
            SET state_epoch = (SELECT state_epoch FROM state_epoch)
            WHERE {clause}
        """

        txn.execute(sql, args)

        return ()

    def _get_existing_groups_with_lock(
        self, txn: LoggingTransaction, state_groups: Collection[int]
    ) -> AbstractSet[int]:
        """Return which of the given state groups are in the database, and lock
        those rows with `KEY SHARE` to ensure they don't get concurrently
        deleted."""
        clause, args = make_in_list_sql_clause(self.db_pool.engine, "id", state_groups)

        sql = f"""
            SELECT id FROM state_groups
            WHERE {clause}
        """
        if isinstance(self.db_pool.engine, PostgresEngine):
            # On postgres we add a row-level lock to the rows to ensure that we
            # conflict with any concurrent DELETEs. A `FOR KEY SHARE` lock will
            # not conflict with other reads.
            sql += """
            FOR KEY SHARE
            """

        txn.execute(sql, args)
        return {state_group for (state_group,) in txn}

    @contextlib.asynccontextmanager
    async def persisting_state_group_references(
        self, event_and_contexts: Collection[Tuple[EventBase, EventContext]]
    ) -> AsyncIterator[None]:
        """Wraps the persistence of the given events and contexts, ensuring that
        any state groups referenced still exist and that they don't get deleted
        during this."""

        referenced_state_groups: Set[int] = set()
        for event, ctx in event_and_contexts:
            if ctx.rejected or event.internal_metadata.is_outlier():
                continue

            assert ctx.state_group is not None

            referenced_state_groups.add(ctx.state_group)

            if ctx.state_group_before_event:
                referenced_state_groups.add(ctx.state_group_before_event)

        if not referenced_state_groups:
            # We don't reference any state groups, so nothing to do.
            yield
            return

        await self.db_pool.runInteraction(
            "mark_state_groups_as_used",
            self._mark_state_groups_as_used_txn,
            referenced_state_groups,
        )

        try:
            yield None
        finally:
            await self.db_pool.simple_delete_many(
                table="state_groups_persisting",
                column="state_group",
                iterable=referenced_state_groups,
                keyvalues={"instance_name": self._instance_name},
                desc="persisting_state_group_references_delete",
            )

    def _mark_state_groups_as_used_txn(
        self, txn: LoggingTransaction, state_groups: Set[int]
    ) -> None:
        """Marks the given state groups as used. Also checks that the given
        state epoch is not too old."""

        existing_state_groups = self._get_existing_groups_with_lock(txn, state_groups)
        missing_state_groups = state_groups - existing_state_groups
        if missing_state_groups:
            raise Exception(
                f"state groups have been deleted: {shortstr(missing_state_groups)}"
            )

        self.db_pool.simple_delete_many_batch_txn(
            txn,
            table="state_groups_pending_deletion",
            keys=("state_group",),
            values=[(state_group,) for state_group in state_groups],
        )

        self.db_pool.simple_insert_many_txn(
            txn,
            table="state_groups_persisting",
            keys=("state_group", "instance_name"),
            values=[(state_group, self._instance_name) for state_group in state_groups],
        )

    def get_state_groups_that_can_be_purged_txn(
        self, txn: LoggingTransaction, state_groups: Collection[int]
    ) -> Collection[int]:
        """Given a set of state groups, return which state groups can be deleted."""

        if not state_groups:
            return state_groups

        if isinstance(self.db_pool.engine, PostgresEngine):
            # On postgres we want to lock the rows FOR UPDATE as early as
            # possible to help with conflicts.
            clause, args = make_in_list_sql_clause(
                self.db_pool.engine, "id", state_groups
            )
            sql = f"""
                SELECT id FROM state_groups
                WHERE {clause}
                FOR UPDATE
            """
            txn.execute(sql, args)

        current_state_epoch = self.db_pool.simple_select_one_onecol_txn(
            txn,
            table="state_epoch",
            retcol="state_epoch",
            keyvalues={},
        )

        # Check the deletion status in the DB of the given state groups
        clause, args = make_in_list_sql_clause(
            self.db_pool.engine, column="state_group", iterable=state_groups
        )

        sql = f"""
            SELECT state_group, state_epoch FROM (
                SELECT state_group, state_epoch FROM state_groups_pending_deletion
                UNION
                SELECT state_group, null FROM state_groups_persisting
            ) AS s
            WHERE {clause}
        """

        txn.execute(sql, args)

        can_delete = set()
        for state_group, state_epoch in txn:
            if state_epoch is None:
                # A null state epoch means that we are currently persisting
                # events that reference the state group, so we don't delete
                # them.
                continue

            if current_state_epoch - state_epoch < self.NUMBER_EPOCHS_BEFORE_DELETION:
                # Not enough state epochs have occurred to allow us to delete.
                continue

            can_delete.add(state_group)

        return can_delete
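Aside: the deletion rule implemented by `get_state_groups_that_can_be_purged_txn` combines the two bookkeeping tables: a state group may be purged only if no worker has it in `state_groups_persisting`, and only once at least `NUMBER_EPOCHS_BEFORE_DELETION` epochs have passed since its `state_groups_pending_deletion` row was last bumped. A small in-memory model of that rule (illustrative only; the real checks run inside a locked database transaction):

```python
from typing import Dict, Optional, Set

NUMBER_EPOCHS_BEFORE_DELETION = 3


def can_be_purged(
    candidates: Set[int],
    current_epoch: int,
    pending_deletion: Dict[int, int],  # state_group -> epoch when marked/bumped
    persisting: Set[int],              # state groups some worker is persisting
) -> Set[int]:
    deletable = set()
    for group in candidates:
        if group in persisting:
            # Mirrors the NULL state_epoch rows from state_groups_persisting.
            continue
        marked_epoch: Optional[int] = pending_deletion.get(group)
        if marked_epoch is None:
            # Not marked for deletion at all.
            continue
        if current_epoch - marked_epoch < NUMBER_EPOCHS_BEFORE_DELETION:
            # A recent reference bumped the epoch; give in-flight events time.
            continue
        deletable.add(group)
    return deletable


# Example: group 7 was marked at epoch 1; with the current epoch at 4 it is
# deletable, unless a worker is still persisting an event that references it.
assert can_be_purged({7}, 4, {7: 1}, set()) == {7}
assert can_be_purged({7}, 4, {7: 1}, {7}) == set()
```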
synapse/storage/databases/state/store.py

@@ -36,7 +36,10 @@ import attr

 from synapse.api.constants import EventTypes
 from synapse.events import EventBase
-from synapse.events.snapshot import UnpersistedEventContext, UnpersistedEventContextBase
+from synapse.events.snapshot import (
+    UnpersistedEventContext,
+    UnpersistedEventContextBase,
+)
 from synapse.logging.opentracing import tag_args, trace
 from synapse.storage._base import SQLBaseStore
 from synapse.storage.database import (

@@ -55,6 +58,7 @@ from synapse.util.cancellation import cancellable

 if TYPE_CHECKING:
     from synapse.server import HomeServer
+    from synapse.storage.databases.state.epochs import StateEpochDataStore

 logger = logging.getLogger(__name__)

@@ -83,8 +87,10 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):
         database: DatabasePool,
         db_conn: LoggingDatabaseConnection,
         hs: "HomeServer",
+        epoch_store: "StateEpochDataStore",
     ):
         super().__init__(database, db_conn, hs)
+        self._epoch_store = epoch_store

         # Originally the state store used a single DictionaryCache to cache the
         # event IDs for the state types in a given state group to avoid hammering

@@ -467,14 +473,13 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):
         Returns:
             A list of state groups
         """
-        is_in_db = self.db_pool.simple_select_one_onecol_txn(
-            txn,
-            table="state_groups",
-            keyvalues={"id": prev_group},
-            retcol="id",
-            allow_none=True,
-        )
-        if not is_in_db:
+
+        # We need to check that the prev group isn't about to be deleted
+        is_missing = self._epoch_store._check_state_groups_and_bump_deletion_txn(
+            txn,
+            {prev_group},
+        )
+        if is_missing:
             raise Exception(
                 "Trying to persist state with unpersisted prev_group: %r"
                 % (prev_group,)

@@ -546,6 +551,7 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):
                     for key, state_id in context.state_delta_due_to_event.items()
                 ],
             )
+
            return events_and_context

        return await self.db_pool.runInteraction(

@@ -601,14 +607,13 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):
             The state group if successfully created, or None if the state
             needs to be persisted as a full state.
         """
-        is_in_db = self.db_pool.simple_select_one_onecol_txn(
-            txn,
-            table="state_groups",
-            keyvalues={"id": prev_group},
-            retcol="id",
-            allow_none=True,
-        )
-        if not is_in_db:
+
+        # We need to check that the prev group isn't about to be deleted
+        is_missing = self._epoch_store._check_state_groups_and_bump_deletion_txn(
+            txn,
+            {prev_group},
+        )
+        if is_missing:
             raise Exception(
                 "Trying to persist state with unpersisted prev_group: %r"
                 % (prev_group,)

@@ -830,7 +835,7 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):
             List[Tuple[int, int]],
             await self.db_pool.simple_select_many_batch(
                 table="state_group_edges",
-                column="prev_state_group",
+                column="state_group",
                 iterable=state_groups,
                 keyvalues={},
                 retcols=("state_group", "prev_state_group"),

@@ -840,6 +845,35 @@ class StateGroupDataStore(StateBackgroundUpdateStore, SQLBaseStore):

         return dict(rows)

+    @trace
+    @tag_args
+    async def get_next_state_groups(
+        self, state_groups: Iterable[int]
+    ) -> Dict[int, int]:
+        """Fetch the groups that have the given state groups as their previous
+        state groups.
+
+        Args:
+            state_groups
+
+        Returns:
+            A mapping from state group to previous state group.
+        """
+
+        rows = cast(
+            List[Tuple[int, int]],
+            await self.db_pool.simple_select_many_batch(
+                table="state_group_edges",
+                column="prev_state_group",
+                iterable=state_groups,
+                keyvalues={},
+                retcols=("state_group", "prev_state_group"),
+                desc="get_next_state_groups",
+            ),
+        )
+
+        return dict(rows)
+
     async def purge_room_state(self, room_id: str) -> None:
         return await self.db_pool.runInteraction(
             "purge_room_state",
synapse/storage/schema/__init__.py

@@ -19,7 +19,7 @@
 #
 #

-SCHEMA_VERSION = 88  # remember to update the list below when updating
+SCHEMA_VERSION = 89  # remember to update the list below when updating
 """Represents the expectations made by the codebase about the database schema

 This should be incremented whenever the codebase changes its requirements on the
New schema delta SQL file (47 lines; path not captured in this view)

@@ -0,0 +1,47 @@
--
-- This file is licensed under the Affero General Public License (AGPL) version 3.
--
-- Copyright (C) 2025 New Vector, Ltd
--
-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as
-- published by the Free Software Foundation, either version 3 of the
-- License, or (at your option) any later version.
--
-- See the GNU Affero General Public License for more details:
-- <https://www.gnu.org/licenses/agpl-3.0.html>.

-- See the `StateEpochDataStore` for details of these tables.

-- Holds the current state epoch
CREATE TABLE IF NOT EXISTS state_epoch (
    Lock CHAR(1) NOT NULL DEFAULT 'X' UNIQUE,  -- Makes sure this table only has one row.
    state_epoch BIGINT NOT NULL,
    updated_ts BIGINT NOT NULL,
    CHECK (Lock='X')
);

-- Insert a row so that we always have one row in the table. This will get
-- updated when Synapse starts.
INSERT INTO state_epoch (state_epoch, updated_ts) VALUES (0, 0);


-- We add state groups to this table when we want to later delete them. The
-- `state_epoch` column indicates when the state group was inserted.
CREATE TABLE IF NOT EXISTS state_groups_pending_deletion (
    state_group BIGINT NOT NULL,
    state_epoch BIGINT NOT NULL,
    PRIMARY KEY (state_group, state_epoch)
);

CREATE INDEX state_groups_pending_deletion_epoch ON state_groups_pending_deletion(state_epoch);


-- Holds the state groups the worker is currently persisting.
CREATE TABLE IF NOT EXISTS state_groups_persisting (
    state_group BIGINT NOT NULL,
    instance_name TEXT NOT NULL,
    PRIMARY KEY (state_group, instance_name)
);

CREATE INDEX state_groups_persisting_instance_name ON state_groups_persisting(instance_name);
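Aside: `state_epoch` uses the classic single-row-table trick: a constant `Lock` column constrained by both UNIQUE and CHECK guarantees the table can never hold more than one row, so the epoch can only ever be UPDATEd in place. A quick standalone demonstration with Python's sqlite3 (only an illustration; Synapse applies this DDL through its own schema-delta machinery):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE state_epoch (
        Lock CHAR(1) NOT NULL DEFAULT 'X' UNIQUE,
        state_epoch BIGINT NOT NULL,
        updated_ts BIGINT NOT NULL,
        CHECK (Lock='X')
    );
    INSERT INTO state_epoch (state_epoch, updated_ts) VALUES (0, 0);
    """
)

# UPDATE is the only way to change the epoch; a second INSERT violates the
# UNIQUE constraint on the constant Lock column.
conn.execute("UPDATE state_epoch SET state_epoch = state_epoch + 1")
try:
    conn.execute("INSERT INTO state_epoch (state_epoch, updated_ts) VALUES (9, 0)")
except sqlite3.IntegrityError as e:
    print("second row rejected:", e)

print(conn.execute("SELECT state_epoch FROM state_epoch").fetchone())  # (1,)
```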
tests/handlers/test_federation_event.py

@@ -807,6 +807,7 @@ class FederationEventHandlerTests(unittest.FederatingHomeserverTestCase):

         OTHER_USER = f"@user:{self.OTHER_SERVER_NAME}"
         main_store = self.hs.get_datastores().main
+        epoch_store = self.hs.get_datastores().state_epochs

         # Create the room.
         kermit_user_id = self.register_user("kermit", "test")

@@ -958,7 +959,7 @@ class FederationEventHandlerTests(unittest.FederatingHomeserverTestCase):
                         bert_member_event.event_id: bert_member_event,
                         rejected_kick_event.event_id: rejected_kick_event,
                     },
-                    state_res_store=StateResolutionStore(main_store),
+                    state_res_store=StateResolutionStore(main_store, epoch_store),
                 )
             ),
             [bert_member_event.event_id, rejected_kick_event.event_id],

@@ -1003,7 +1004,7 @@ class FederationEventHandlerTests(unittest.FederatingHomeserverTestCase):
                         rejected_power_levels_event.event_id,
                     ],
                     event_map={},
-                    state_res_store=StateResolutionStore(main_store),
+                    state_res_store=StateResolutionStore(main_store, epoch_store),
                     full_conflicted_set=set(),
                 )
             ),
tests/rest/client/test_rooms.py

@@ -742,7 +742,7 @@ class RoomsCreateTestCase(RoomBase):
         self.assertEqual(HTTPStatus.OK, channel.code, channel.result)
         self.assertTrue("room_id" in channel.json_body)
         assert channel.resource_usage is not None
-        self.assertEqual(33, channel.resource_usage.db_txn_count)
+        self.assertEqual(35, channel.resource_usage.db_txn_count)

     def test_post_room_initial_state(self) -> None:
         # POST with initial_state config key, expect new room id

@@ -755,7 +755,7 @@ class RoomsCreateTestCase(RoomBase):
         self.assertEqual(HTTPStatus.OK, channel.code, channel.result)
         self.assertTrue("room_id" in channel.json_body)
         assert channel.resource_usage is not None
-        self.assertEqual(35, channel.resource_usage.db_txn_count)
+        self.assertEqual(37, channel.resource_usage.db_txn_count)

     def test_post_room_visibility_key(self) -> None:
         # POST with visibility config key, expect new room id
tests/rest/client/utils.py

@@ -548,7 +548,7 @@ class RestHelper:
         room_id: str,
         event_type: str,
         body: Dict[str, Any],
-        tok: Optional[str],
+        tok: Optional[str] = None,
         expect_code: int = HTTPStatus.OK,
         state_key: str = "",
     ) -> JsonDict:
tests/storage/test_purge.py

@@ -40,6 +40,7 @@ class PurgeTests(HomeserverTestCase):
         self.room_id = self.helper.create_room_as(self.user_id)

         self.store = hs.get_datastores().main
+        self.state_store = hs.get_datastores().state
         self._storage_controllers = self.hs.get_storage_controllers()

     def test_purge_history(self) -> None:

@@ -128,3 +129,107 @@ class PurgeTests(HomeserverTestCase):
         self.store._invalidate_local_get_event_cache(create_event.event_id)
         self.get_failure(self.store.get_event(create_event.event_id), NotFoundError)
         self.get_failure(self.store.get_event(first["event_id"]), NotFoundError)
+
+    def test_purge_state_groups(self) -> None:
+        """Test that when purging we delete the relevant state groups"""
+
+        self.helper.send(self.room_id, body="test1")
+        self.helper.send_state(self.room_id, "org.matrix.test", body={"number": 2})
+        self.helper.send_state(self.room_id, "org.matrix.test", body={"number": 3})
+        self.helper.send(self.room_id, body="test4")
+        last = self.helper.send(self.room_id, body="test5")
+
+        # Get the topological token
+        token = self.get_success(
+            self.store.get_topological_token_for_event(last["event_id"])
+        )
+        token_str = self.get_success(token.to_string(self.hs.get_datastores().main))
+
+        # Purge everything before this topological token
+        self.get_success(
+            self._storage_controllers.purge_events.purge_history(
+                self.room_id, token_str, True
+            )
+        )
+
+        # We expect there to now only be one state group for the room, which is
+        # the state group of the last event (as the only outlier).
+        state_groups = self.get_success(
+            self.state_store.db_pool.simple_select_onecol(
+                table="state_groups",
+                keyvalues={"room_id": self.room_id},
+                retcol="id",
+                desc="test_purge_state_groups",
+            )
+        )
+        self.assertEqual(len(state_groups), 1)
+
+    def test_purge_unreferenced_state_group(self) -> None:
+        """Test that purging a room also gets rid of unreferenced state groups
+        it encounters during the purge.
+
+        This is important, as otherwise these unreferenced state groups get
+        "de-deltaed" during the purge process, consuming lots of disk space.
+        """
+
+        self.helper.send(self.room_id, body="test1")
+        state1 = self.helper.send_state(
+            self.room_id, "org.matrix.test", body={"number": 2}
+        )
+        state2 = self.helper.send_state(
+            self.room_id, "org.matrix.test", body={"number": 3}
+        )
+        self.helper.send(self.room_id, body="test4")
+        last = self.helper.send(self.room_id, body="test5")
+
+        # Create an unreferenced state group that has a prev group of one of the
+        # to-be-purged events.
+        prev_group = self.get_success(
+            self.store._get_state_group_for_event(state1["event_id"])
+        )
+        unreferenced_state_group = self.get_success(
+            self.state_store.store_state_group(
+                event_id=last["event_id"],
+                room_id=self.room_id,
+                prev_group=prev_group,
+                delta_ids={("org.matrix.test", ""): state2["event_id"]},
+                current_state_ids=None,
+            )
+        )
+
+        # Get the topological token
+        token = self.get_success(
+            self.store.get_topological_token_for_event(last["event_id"])
+        )
+        token_str = self.get_success(token.to_string(self.hs.get_datastores().main))
+
+        # Purge everything before this topological token
+        self.get_success(
+            self._storage_controllers.purge_events.purge_history(
+                self.room_id, token_str, True
+            )
+        )
+
+        # We expect that the unreferenced state group has been deleted.
+        row = self.get_success(
+            self.state_store.db_pool.simple_select_one_onecol(
+                table="state_groups",
+                keyvalues={"id": unreferenced_state_group},
+                retcol="id",
+                allow_none=True,
+                desc="test_purge_unreferenced_state_group",
+            )
+        )
+        self.assertIsNone(row)
+
+        # We expect there to now only be one state group for the room, which is
+        # the state group of the last event (as the only outlier).
+        state_groups = self.get_success(
+            self.state_store.db_pool.simple_select_onecol(
+                table="state_groups",
+                keyvalues={"room_id": self.room_id},
+                retcol="id",
+                desc="test_purge_unreferenced_state_group",
+            )
+        )
+        self.assertEqual(len(state_groups), 1)
tests/test_state.py

@@ -31,7 +31,7 @@ from typing import (
     Tuple,
     cast,
 )
-from unittest.mock import Mock
+from unittest.mock import AsyncMock, Mock

 from twisted.internet import defer

@@ -221,7 +221,16 @@ class Graph:
 class StateTestCase(unittest.TestCase):
     def setUp(self) -> None:
         self.dummy_store = _DummyStore()
-        storage_controllers = Mock(main=self.dummy_store, state=self.dummy_store)
+
+        # Add a dummy epoch store that always returns that we have all the
+        # necessary state groups.
+        dummy_epoch_store = AsyncMock()
+        dummy_epoch_store.check_state_groups_and_bump_deletion.return_value = []
+
+        storage_controllers = Mock(
+            main=self.dummy_store,
+            state=self.dummy_store,
+        )
         hs = Mock(
             spec_set=[
                 "config",

@@ -241,7 +250,10 @@ class StateTestCase(unittest.TestCase):
         )
         clock = cast(Clock, MockClock())
         hs.config = default_config("tesths", True)
-        hs.get_datastores.return_value = Mock(main=self.dummy_store)
+        hs.get_datastores.return_value = Mock(
+            main=self.dummy_store,
+            state_epochs=dummy_epoch_store,
+        )
         hs.get_state_handler.return_value = None
         hs.get_clock.return_value = clock
         hs.get_macaroon_generator.return_value = MacaroonGenerator(