diff --git a/changelog.d/18235.misc b/changelog.d/18235.misc new file mode 100644 index 0000000000..1c3c42a73c --- /dev/null +++ b/changelog.d/18235.misc @@ -0,0 +1 @@ +Add caching support to media endpoints. diff --git a/synapse/media/_base.py b/synapse/media/_base.py index 7877df62fa..2e48d2fdc7 100644 --- a/synapse/media/_base.py +++ b/synapse/media/_base.py @@ -118,6 +118,9 @@ DEFAULT_MAX_TIMEOUT_MS = 20_000 # Maximum allowed timeout_ms for download and thumbnail requests MAXIMUM_ALLOWED_MAX_TIMEOUT_MS = 60_000 +# The ETag header value to use for immutable media. This can be anything. +_IMMUTABLE_ETAG = "1" + def respond_404(request: SynapseRequest) -> None: assert request.path is not None @@ -224,12 +227,7 @@ def add_file_headers( request.setHeader(b"Content-Disposition", disposition.encode("ascii")) - # cache for at least a day. - # XXX: we might want to turn this off for data we don't want to - # recommend caching as it's sensitive or private - or at least - # select private. don't bother setting Expires as all our - # clients are smart enough to be happy with Cache-Control - request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400") + _add_cache_headers(request) if file_size is not None: request.setHeader(b"Content-Length", b"%d" % (file_size,)) @@ -240,6 +238,26 @@ def add_file_headers( request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex") +def _add_cache_headers(request: Request) -> None: + """Adds the appropriate cache headers to the response""" + + # Cache on the client for at least a day. + # + # We set this to "public,s-maxage=0,proxy-revalidate" to allow CDNs to cache + # the media, so long as they "revalidate" the media on every request. By + # revalidate, we mean send the request to Synapse with a `If-None-Match` + # header, to which Synapse can either respond with a 304 if the user is + # authenticated/authorized, or a 401/403 if they're not. + request.setHeader( + b"Cache-Control", b"public,max-age=86400,s-maxage=0,proxy-revalidate" + ) + + # Set an ETag header to allow requesters to use it in requests to check if + # the cache is still valid. Since media is immutable (though may be + # deleted), we just set this to a constant. + request.setHeader(b"ETag", _IMMUTABLE_ETAG) + + # separators as defined in RFC2616. SP and HT are handled separately. # see _can_encode_filename_as_token. _FILENAME_SEPARATOR_CHARS = { @@ -336,13 +354,15 @@ async def respond_with_multipart_responder( from synapse.media.media_storage import MultipartFileConsumer + _add_cache_headers(request) + # note that currently the json_object is just {}, this will change when linked media # is implemented multipart_consumer = MultipartFileConsumer( clock, request, media_type, - {}, + {}, # Note: if we change this we need to change the returned ETag. disposition, media_length, ) @@ -419,6 +439,46 @@ async def respond_with_responder( finish_request(request) +def respond_with_304(request: SynapseRequest) -> None: + request.setResponseCode(304) + + # could alternatively use request.notifyFinish() and flip a flag when + # the Deferred fires, but since the flag is RIGHT THERE it seems like + # a waste. + if request._disconnected: + logger.warning( + "Not sending response to request %s, already disconnected.", request + ) + return None + + _add_cache_headers(request) + + request.finish() + + +def check_for_cached_entry_and_respond(request: SynapseRequest) -> bool: + """Check if the request has a conditional header that allows us to return a + 304 Not Modified response, and if it does, return a 304 response. + + This handles clients and intermediary proxies caching media. + This method assumes that the user has already been + authorised to request the media. + + Returns True if we have responded.""" + + # We've checked the user has access to the media, so we now check if it + # is a "conditional request" and we can just return a `304 Not Modified` + # response. Since media is immutable (though may be deleted), we just + # check this is the expected constant. + etag = request.getHeader("If-None-Match") + if etag == _IMMUTABLE_ETAG: + # Return a `304 Not modified`. + respond_with_304(request) + return True + + return False + + class Responder(ABC): """Represents a response that can be streamed to the requester. diff --git a/synapse/media/media_repository.py b/synapse/media/media_repository.py index f4d25a7b8b..cf4cba722a 100644 --- a/synapse/media/media_repository.py +++ b/synapse/media/media_repository.py @@ -52,6 +52,7 @@ from synapse.media._base import ( FileInfo, Responder, ThumbnailInfo, + check_for_cached_entry_and_respond, get_filename_from_headers, respond_404, respond_with_multipart_responder, @@ -459,6 +460,11 @@ class MediaRepository: self.mark_recently_accessed(None, media_id) + # Once we've checked auth we can return early if the media is cached on + # the client + if check_for_cached_entry_and_respond(request): + return + media_type = media_info.media_type if not media_type: media_type = "application/octet-stream" @@ -538,6 +544,17 @@ class MediaRepository: allow_authenticated, ) + # Check if the media is cached on the client, if so return 304. We need + # to do this after we have fetched remote media, as we need it to do the + # auth. + if check_for_cached_entry_and_respond(request): + # We always need to use the responder. + if responder: + with responder: + pass + + return + # We deliberately stream the file outside the lock if responder and media_info: upload_name = name if name else media_info.upload_name diff --git a/synapse/media/thumbnailer.py b/synapse/media/thumbnailer.py index d6b8ce4a09..5d9afda322 100644 --- a/synapse/media/thumbnailer.py +++ b/synapse/media/thumbnailer.py @@ -34,6 +34,7 @@ from synapse.logging.opentracing import trace from synapse.media._base import ( FileInfo, ThumbnailInfo, + check_for_cached_entry_and_respond, respond_404, respond_with_file, respond_with_multipart_responder, @@ -294,6 +295,11 @@ class ThumbnailProvider: if media_info.authenticated: raise NotFoundError() + # Once we've checked auth we can return early if the media is cached on + # the client + if check_for_cached_entry_and_respond(request): + return + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) await self._select_and_respond_with_thumbnail( request, @@ -334,6 +340,11 @@ class ThumbnailProvider: if media_info.authenticated: raise NotFoundError() + # Once we've checked auth we can return early if the media is cached on + # the client + if check_for_cached_entry_and_respond(request): + return + thumbnail_infos = await self.store.get_local_media_thumbnails(media_id) for info in thumbnail_infos: t_w = info.width == desired_width @@ -431,6 +442,10 @@ class ThumbnailProvider: respond_404(request) return + # Check if the media is cached on the client, if so return 304. + if check_for_cached_entry_and_respond(request): + return + thumbnail_infos = await self.store.get_remote_media_thumbnails( server_name, media_id ) @@ -510,6 +525,10 @@ class ThumbnailProvider: if media_info.authenticated: raise NotFoundError() + # Check if the media is cached on the client, if so return 304. + if check_for_cached_entry_and_respond(request): + return + thumbnail_infos = await self.store.get_remote_media_thumbnails( server_name, media_id ) diff --git a/tests/federation/test_federation_media.py b/tests/federation/test_federation_media.py index e66aae499b..9c92003ce5 100644 --- a/tests/federation/test_federation_media.py +++ b/tests/federation/test_federation_media.py @@ -147,6 +147,45 @@ class FederationMediaDownloadsTest(unittest.FederatingHomeserverTestCase): found_file = any(SMALL_PNG in field for field in stripped_bytes) self.assertTrue(found_file) + def test_federation_etag(self) -> None: + """Test that federation ETags work""" + + content = io.BytesIO(b"file_to_stream") + content_uri = self.get_success( + self.media_repo.create_content( + "text/plain", + "test_upload", + content, + 46, + UserID.from_string("@user_id:whatever.org"), + ) + ) + + channel = self.make_signed_federation_request( + "GET", + f"/_matrix/federation/v1/media/download/{content_uri.media_id}", + ) + self.pump() + self.assertEqual(200, channel.code) + + # We expect exactly one ETag header. + etags = channel.headers.getRawHeaders("ETag") + self.assertIsNotNone(etags) + assert etags is not None # For mypy + self.assertEqual(len(etags), 1) + etag = etags[0] + + # Refetching with the etag should result in 304 and empty body. + channel = self.make_signed_federation_request( + "GET", + f"/_matrix/federation/v1/media/download/{content_uri.media_id}", + custom_headers=[("If-None-Match", etag)], + ) + self.pump() + self.assertEqual(channel.code, 304) + self.assertEqual(channel.is_finished(), True) + self.assertNotIn("body", channel.result) + class FederationThumbnailTest(unittest.FederatingHomeserverTestCase): def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: diff --git a/tests/rest/client/test_media.py b/tests/rest/client/test_media.py index 7d7dbd342b..0e3e370ee8 100644 --- a/tests/rest/client/test_media.py +++ b/tests/rest/client/test_media.py @@ -2676,3 +2676,113 @@ class AuthenticatedMediaTestCase(unittest.HomeserverTestCase): access_token=self.tok, ) self.assertEqual(channel10.code, 200) + + def test_authenticated_media_etag(self) -> None: + """Test that ETag works correctly with authenticated media over client + APIs""" + + # upload some local media with authentication on + channel = self.make_request( + "POST", + "_matrix/media/v3/upload?filename=test_png_upload", + SMALL_PNG, + self.tok, + shorthand=False, + content_type=b"image/png", + custom_headers=[("Content-Length", str(67))], + ) + self.assertEqual(channel.code, 200) + res = channel.json_body.get("content_uri") + assert res is not None + uri = res.split("mxc://")[1] + + # Check standard media endpoint + self._check_caching(f"/download/{uri}") + + # check thumbnails as well + params = "?width=32&height=32&method=crop" + self._check_caching(f"/thumbnail/{uri}{params}") + + # Inject a piece of remote media. + file_id = "abcdefg12345" + file_info = FileInfo(server_name="lonelyIsland", file_id=file_id) + + media_storage = self.hs.get_media_repository().media_storage + + ctx = media_storage.store_into_file(file_info) + (f, fname) = self.get_success(ctx.__aenter__()) + f.write(SMALL_PNG) + self.get_success(ctx.__aexit__(None, None, None)) + + # we write the authenticated status when storing media, so this should pick up + # config and authenticate the media + self.get_success( + self.store.store_cached_remote_media( + origin="lonelyIsland", + media_id="52", + media_type="image/png", + media_length=1, + time_now_ms=self.clock.time_msec(), + upload_name="remote_test.png", + filesystem_id=file_id, + ) + ) + + # ensure we have thumbnails for the non-dynamic code path + if self.extra_config == {"dynamic_thumbnails": False}: + self.get_success( + self.repo._generate_thumbnails( + "lonelyIsland", "52", file_id, "image/png" + ) + ) + + self._check_caching("/download/lonelyIsland/52") + + params = "?width=32&height=32&method=crop" + self._check_caching(f"/thumbnail/lonelyIsland/52{params}") + + def _check_caching(self, path: str) -> None: + """ + Checks that: + 1. fetching the path returns an ETag header + 2. refetching with the ETag returns a 304 without a body + 3. refetching with the ETag but through unauthenticated endpoint + returns 404 + """ + + # Request media over authenticated endpoint, should be found + channel1 = self.make_request( + "GET", + f"/_matrix/client/v1/media{path}", + access_token=self.tok, + shorthand=False, + ) + self.assertEqual(channel1.code, 200) + + # Should have a single ETag field + etags = channel1.headers.getRawHeaders("ETag") + self.assertIsNotNone(etags) + assert etags is not None # For mypy + self.assertEqual(len(etags), 1) + etag = etags[0] + + # Refetching with the etag should result in 304 and empty body. + channel2 = self.make_request( + "GET", + f"/_matrix/client/v1/media{path}", + access_token=self.tok, + shorthand=False, + custom_headers=[("If-None-Match", etag)], + ) + self.assertEqual(channel2.code, 304) + self.assertEqual(channel2.is_finished(), True) + self.assertNotIn("body", channel2.result) + + # Refetching with the etag but no access token should result in 404. + channel3 = self.make_request( + "GET", + f"/_matrix/media/r0{path}", + shorthand=False, + custom_headers=[("If-None-Match", etag)], + ) + self.assertEqual(channel3.code, 404)