From 2cc46703b9c848b058141440a5c9f62d610d9fee Mon Sep 17 00:00:00 2001
From: Emi Matchu
Date: Fri, 23 Feb 2024 12:02:39 -0800
Subject: [PATCH] Create NeopetsMediaArchive, read the actual manifests for Alt Styles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Neopets Media Archive is a service that mirrors `images.neopets.com`
over time! Right now we're starting by just loading manifests, and using
them to replace the hacks we used for determining the Alt Style PNG and
SVG URLs; but with time, I want to load *all* customization media files,
to have our own secondary file source that isn't dependent on Neopets to
always be up.

Impress 2020 already caches manifest files, but this strategy is
different in two ways:

1. We're using the filesystem rather than a database column. (That is,
   manifest data is kinda duplicated in the system right now!) This is
   because I intend to go in a more file-y way long-term anyway, to load
   more than just the manifests.
2. Impress 2020 guesses at the manifest URLs by pattern, and reloads them
   on a regular basis. Instead, we use the modeling system: when TNT
   changes the URL of a manifest by appending a new `?v=` query string to
   it, this system will consider it a new URL, and will load the new copy
   accordingly.

Fun fact, I actually have been prototyping some of this stuff in a side
project I'd named `impress-media-server`! It's a little Sinatra app that
indeed *does* save all the files needed for customization, and can
generate lightweight lil preview iframes and images pretty easily. I had
initially been planning this as a separate service, but after thinking
over the arch a bit, I think it'll go smoother to just give the main app
all the same access and awareness—and I wrote it all in Ruby and plain
HTML/JS/CSS, so it should be pretty easy to port over bit-by-bit!
Anyway, only Alt Styles use this for now, but my motivation is to be able
to use more-correct asset URL logic to be able to finally swap over
wardrobe-2020's item search to impress.openneo.net's item search API
endpoint—which will get "Items You Own" searches working again, and
whittle down one of the last big things Impress 2020 can do that the main
app can't. Let's see how it goes!
---
 app/models/swf_asset.rb               |  44 +++++-----
 app/services/neopets_media_archive.rb | 119 ++++++++++++++++++++++++++
 config/environments/development.rb    |   5 ++
 config/environments/production.rb     |   4 +
 config/environments/test.rb           |   5 ++
 deploy/setup.yml                      |   8 ++
 6 files changed, 164 insertions(+), 21 deletions(-)
 create mode 100644 app/services/neopets_media_archive.rb

diff --git a/app/models/swf_asset.rb b/app/models/swf_asset.rb
index 3f7db65b..3b6ecae9 100644
--- a/app/models/swf_asset.rb
+++ b/app/models/swf_asset.rb
@@ -113,31 +113,33 @@ class SwfAsset < ApplicationRecord
     }
   end
 
-  MANIFEST_PATTERN = %r{^https://images.neopets.com/(?<prefix>.+)/(?<id>[0-9]+)(?<hash_part>_[^/]+)?/manifest\.json}
-  def html5_image_url
-    return nil if manifest_url.nil?
+  # Load and parse this asset's manifest JSON via the Neopets Media Archive.
+  def manifest
+    NeopetsMediaArchive.load_json(manifest_url)
+  end
 
-    # HACK: Just assuming all of these were well-formed by the same process,
-    # and infer the image URL from the manifest URL! But strictly speaking we
-    # should be reading the manifest to check!
-    match = manifest_url.match(MANIFEST_PATTERN)
-    return nil if match.nil?
-
-    "https://images.neopets.com/#{match[:prefix]}/" +
-      "#{match[:id]}#{match[:hash_part]}/#{match[:id]}.png"
+  MANIFEST_BASE_URL = Addressable::URI.parse("https://images.neopets.com")
+  # Asset URLs from the manifest, keyed by :png/:svg/:js; {} if unreadable.
+  def manifest_asset_urls
+    return {} if manifest_url.nil?
+
+    # Organize the asset URLs by file extension, grab the ones we want, and
+    # convert them from paths to full URLs.
+    manifest["cpmanifest"]["assets"][0]["asset_data"]
+      .to_h { |a| [a["file_ext"].to_sym, a] }
+      .slice(:png, :svg, :js)
+      .transform_values { |a| (MANIFEST_BASE_URL + a["url"]).to_s }
+  rescue StandardError => error
+    Rails.logger.error "Could not read URLs from manifest: #{error.full_message}"
+    {}
+  end
+
+  def html5_image_url
+    manifest_asset_urls[:png]
   end
 
   def html5_svg_url
-    return nil if manifest_url.nil?
-
-    # HACK: Just assuming all of these were well-formed by the same process,
-    # and infer the image URL from the manifest URL! But strictly speaking we
-    # should be reading the manifest to check!
-    match = manifest_url.match(MANIFEST_PATTERN)
-    return nil if match.nil?
-
-    "https://images.neopets.com/#{match[:prefix]}/" +
-      "#{match[:id]}#{match[:hash_part]}/#{match[:id]}.svg"
+    manifest_asset_urls[:svg]
   end
 
   def known_glitches
diff --git a/app/services/neopets_media_archive.rb b/app/services/neopets_media_archive.rb
new file mode 100644
index 00000000..7d45eced
--- /dev/null
+++ b/app/services/neopets_media_archive.rb
@@ -0,0 +1,119 @@
+require "addressable/template"
+require "addressable/uri"
+require "httparty"
+require "json"
+
+# The Neopets Media Archive is a service that mirrors images.neopets.com files
+# locally. You can request a file from it, and we'll serve it from disk if we
+# have it, or request and save it if not.
+#
+# This is a bit different than a cache, because the intention is not just
+# optimization but that we *want* to be saving images.neopets.com as a
+# long-term archive, not dependent on their services having 100% uptime in
+# order for us to operate. We never discard old files, we just keep going!
+module NeopetsMediaArchive
+  include HTTParty
+  base_uri "https://images.neopets.com/"
+
+  OLD_MANIFEST_PATH_TEMPLATE = Addressable::Template.new(
+    "https://images.neopets.com/cp/{short_type}/data/{id1}/{id2}/{id3}/{id}_{hash}/manifest.json"
+  )
+  NEW_MANIFEST_PATH_TEMPLATE = Addressable::Template.new(
+    "https://images.neopets.com/cp/{short_type}/data/{id1}/{id2}/{id3}/{id}/manifest.json"
+  )
+
+  # Load the file from the given `images.neopets.com` URI, as JSON.
+  def self.load_json(uri)
+    JSON.parse(load_file(uri))
+  end
+
+  # Load the file from the given `images.neopets.com` URI.
+  def self.load_file(uri, return_content: true)
+    local_path = local_file_path(uri)
+
+    # Read the file locally if we have it.
+    if return_content
+      begin
+        content = File.read(local_path)
+        debug "Loaded source file from filesystem: #{local_path}"
+        return content
+      rescue Errno::ENOENT
+        # If it doesn't exist, that's fine: just move on and download it.
+      end
+    elsif File.exist?(local_path)
+      # When we don't need the content, "loading" the file is just ensuring
+      # it exists. If it doesn't, we'll move on and load it from source.
+      # (We use this when preloading files, to save the cost of reading files
+      # we're not ready to use yet.)
+      debug "Source file is already loaded, skipping: #{local_path}"
+      return
+    end
+
+    # Download the file from the origin, then save a byte-exact (binary) copy.
+    response = load_file_from_origin(uri)
+    info "Loaded source file from origin: #{uri}"
+    content = response.body
+    local_path.dirname.mkpath
+    File.binwrite(local_path, content)
+    info "Wrote source file to filesystem: #{local_path}"
+
+    return_content ? content : nil
+  end
+
+  # Load the file from the given `images.neopets.com` URI, but don't return its
+  # content. This can be faster in cases where the file's content isn't
+  # relevant to us, and we just want to ensure it exists.
+  def self.preload_file(uri)
+    load_file(uri, return_content: false)
+  end
+
+  # Load the file from the given `images.neopets.com` URI, directly from the
+  # source, without checking the local filesystem.
+  def self.load_file_from_origin(uri)
+    unless Addressable::URI.parse(uri).origin == "https://images.neopets.com"
+      raise ArgumentError, "NeopetsMediaArchive can only load from " +
+        "https://images.neopets.com, but got #{uri}"
+    end
+
+    response = get(uri)
+    if response.code == 404
+      raise NotFound, "origin server returned 404: #{uri}"
+    elsif response.code != 200
+      raise "expected status 200 but got #{response.code} (#{uri})"
+    end
+    response
+  end
+
+  # The archive-relative path for a URI: its host, path, and query string.
+  def self.path_within_archive(uri)
+    uri = Addressable::URI.parse(uri)
+    path = uri.host + uri.path
+
+    # We include the query string as part of the file path, which is a bit odd!
+    # But Neopets often uses this for cache-busting, so we do need a mechanism
+    # for knowing whether we're holding the right version of the file. We could
+    # also consider storing the file by just its normal path, but with some
+    # metadata to track versioning information (e.g. a sqlite db, or a metadata
+    # file in the same directory).
+    path += "?" + uri.query if !uri.query.nil? && !uri.query.empty?
+
+    path
+  end
+
+  # Local disk path for a URI; `join` accepts a String or Pathname root.
+  def self.local_file_path(uri)
+    root = Rails.configuration.neopets_media_archive_root
+    Rails.root.join(root, path_within_archive(uri))
+  end
+
+  class NotFound < StandardError; end
+
+  # Bare `private` doesn't affect `def self.` methods; hide these explicitly.
+  private_class_method def self.info(message)
+    Rails.logger.info "[NeopetsMediaArchive] #{message}"
+  end
+
+  private_class_method def self.debug(message)
+    Rails.logger.debug "[NeopetsMediaArchive] #{message}"
+  end
+end
diff --git a/config/environments/development.rb b/config/environments/development.rb
index d57dad98..6eac4c70 100644
--- a/config/environments/development.rb
+++ b/config/environments/development.rb
@@ -107,4 +107,9 @@ Rails.application.configure do
   # override this with the IMPRESS_2020_ORIGIN environment variable!)
config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN", "http://localhost:4000") + + # Save the Neopets Media Archive in the local `tmp` folder. (In production, + # we keep this in a long-term location instead!) + config.neopets_media_archive_root = Rails.root / "tmp" / + "neopets_media_archive" / "development" end diff --git a/config/environments/production.rb b/config/environments/production.rb index 37c5cfb3..20858d10 100644 --- a/config/environments/production.rb +++ b/config/environments/production.rb @@ -126,4 +126,8 @@ Rails.application.configure do # IMPRESS_2020_ORIGIN environment variable!) config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN", "https://impress-2020.openneo.net") + + # Save the Neopets Media Archive in `/var/lib/neopets-media-archive`, a + # long-term storage location. + config.neopets_media_archive_root = "/var/lib/neopets-media-archive" end diff --git a/config/environments/test.rb b/config/environments/test.rb index d7dee4e8..2f300377 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -66,4 +66,9 @@ Rails.application.configure do # override this with the IMPRESS_2020_ORIGIN environment variable!) config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN", "http://localhost:4000") + + # Save the Neopets Media Archive in the local `tmp` folder. (In production, + # we keep this in a long-term location instead!) + config.neopets_media_archive_root = Rails.root / "tmp" / + "neopets_media_archive" / "test" end diff --git a/deploy/setup.yml b/deploy/setup.yml index 318a4f9f..4a573042 100644 --- a/deploy/setup.yml +++ b/deploy/setup.yml @@ -402,6 +402,14 @@ password: "{{ mysql_user_password_2020 }}" priv: "openneo_impress.*:ALL,openneo_id.*:ALL" + - name: Create the Neopets Media Archive data directory + file: + path: /var/lib/neopets-media-archive + owner: impress + group: impress + mode: "755" + state: directory + handlers: - name: Reload nginx systemd: