Patch: add NeopetsMediaArchive and read HTML5 asset URLs from the manifest.

What this patch does:
  * app/services/neopets_media_archive.rb (new): a service that mirrors
    images.neopets.com files on local disk, downloading each file from the
    origin the first time it is requested.
  * app/models/swf_asset.rb: html5_image_url / html5_svg_url now read the
    asset URLs out of the manifest JSON, instead of inferring them from the
    manifest URL with a regular expression.
  * config/environments/*.rb and deploy/setup.yml: configure and create the
    directory that the archive is stored in.

Review changes folded into this copy of the patch:
  * Restored the one-line-per-diff-line layout so the patch can be parsed.
    NOTE(review): indentation, blank context lines, and the wrapping of the
    two-line `config.impress_2020_origin` context statements were rebuilt
    from the hunk line counts; confirm them against the working tree before
    applying.
  * Restored the named groups (prefix, id, hash_part) in the removed
    MANIFEST_PATTERN line; they are the names the removed code reads from
    the match.
  * local_file_path wraps the configured root in `Pathname()`, and
    production.rb now sets a Pathname like development/test do. With a plain
    String root, `+` glued the root and the path together with no separator,
    and `local_path.dirname.mkpath` raised NoMethodError.
  * `info` / `debug` are marked with `private_class_method`; a bare `private`
    has no effect on `def self.` methods.
  * Downloaded files are written with `File.binwrite`.
  * The new-file hunk header is `+1,127` to account for the added lines.

diff --git a/app/models/swf_asset.rb b/app/models/swf_asset.rb
index 3f7db65b..3b6ecae9 100644
--- a/app/models/swf_asset.rb
+++ b/app/models/swf_asset.rb
@@ -113,31 +113,33 @@ class SwfAsset < ApplicationRecord
     }
   end
 
-  MANIFEST_PATTERN = %r{^https://images.neopets.com/(?<prefix>.+)/(?<id>[0-9]+)(?<hash_part>_[^/]+)?/manifest\.json}
-  def html5_image_url
-    return nil if manifest_url.nil?
+  def manifest
+    NeopetsMediaArchive.load_json(manifest_url)
+  end
 
-    # HACK: Just assuming all of these were well-formed by the same process,
-    # and infer the image URL from the manifest URL! But strictly speaking we
-    # should be reading the manifest to check!
-    match = manifest_url.match(MANIFEST_PATTERN)
-    return nil if match.nil?
-
-    "https://images.neopets.com/#{match[:prefix]}/" +
-      "#{match[:id]}#{match[:hash_part]}/#{match[:id]}.png"
+  MANIFEST_BASE_URL = Addressable::URI.parse("https://images.neopets.com")
+  def manifest_asset_urls
+    return {} if manifest_url.nil?
+
+    begin
+      # Organize the asset URLs by file extension, grab the ones we want, and
+      # convert them from paths to full URLs.
+      manifest["cpmanifest"]["assets"][0]["asset_data"].
+        to_h { |a| [a["file_ext"].to_sym, a] }.
+        slice(:png, :svg, :js).
+        transform_values { |a| (MANIFEST_BASE_URL + a["url"]).to_s }
+    rescue StandardError => error
+      Rails.logger.error "Could not read URLs from manifest: #{error.full_message}"
+      return {}
+    end
+  end
+
+  def html5_image_url
+    manifest_asset_urls[:png]
   end
 
   def html5_svg_url
-    return nil if manifest_url.nil?
-
-    # HACK: Just assuming all of these were well-formed by the same process,
-    # and infer the image URL from the manifest URL! But strictly speaking we
-    # should be reading the manifest to check!
-    match = manifest_url.match(MANIFEST_PATTERN)
-    return nil if match.nil?
-
-    "https://images.neopets.com/#{match[:prefix]}/" +
-      "#{match[:id]}#{match[:hash_part]}/#{match[:id]}.svg"
+    manifest_asset_urls[:svg]
   end
 
   def known_glitches
diff --git a/app/services/neopets_media_archive.rb b/app/services/neopets_media_archive.rb
new file mode 100644
index 00000000..7d45eced
--- /dev/null
+++ b/app/services/neopets_media_archive.rb
@@ -0,0 +1,127 @@
+require "addressable/template"
+require "addressable/uri"
+require "httparty"
+require "json"
+require "pathname"
+
+# The Neopets Media Archive is a service that mirrors images.neopets.com files
+# locally. You can request a file from it, and we'll serve it from disk if we
+# have it, or request and save it if not.
+#
+# This is a bit different than a cache, because the intention is not just
+# optimization but that we *want* to be saving images.neopets.com as a
+# long-term archive, not dependent on their services having 100% uptime in
+# order for us to operate. We never discard old files, we just keep going!
+module NeopetsMediaArchive
+  include HTTParty
+  base_uri "https://images.neopets.com/"
+
+  OLD_MANIFEST_PATH_TEMPLATE = Addressable::Template.new(
+    "https://images.neopets.com/cp/{short_type}/data/{id1}/{id2}/{id3}/{id}_{hash}/manifest.json"
+  )
+  NEW_MANIFEST_PATH_TEMPLATE = Addressable::Template.new(
+    "https://images.neopets.com/cp/{short_type}/data/{id1}/{id2}/{id3}/{id}/manifest.json"
+  )
+
+  # Load the file from the given `images.neopets.com` URI, as JSON.
+  def self.load_json(uri)
+    JSON.parse(load_file(uri))
+  end
+
+  # Load the file from the given `images.neopets.com` URI.
+  def self.load_file(uri, return_content: true)
+    local_path = local_file_path(uri)
+
+    # Read the file locally if we have it.
+    if return_content
+      begin
+        content = File.read(local_path)
+        debug "Loaded source file from filesystem: #{local_path}"
+        return content
+      rescue Errno::ENOENT
+        # If it doesn't exist, that's fine: just move on and download it.
+      end
+    else
+      # When we don't need the content, "loading" the file is just ensuring
+      # it exists. If it doesn't, we'll move on and load it from source.
+      # (We use this when preloading files, to save the cost of reading files
+      # we're not ready to use yet.)
+      if File.exist?(local_path)
+        debug "Source file is already loaded, skipping: #{local_path}"
+        return
+      end
+    end
+
+    # Download the file from the origin, then save a copy for next time.
+    response = load_file_from_origin(uri)
+    info "Loaded source file from origin: #{uri}"
+    content = response.body
+    local_path.dirname.mkpath
+    # Write in binary mode, so the origin's bytes are stored exactly as-is.
+    File.binwrite(local_path, content)
+    info "Wrote source file to filesystem: #{local_path}"
+
+    return_content ? content : nil
+  end
+
+  # Load the file from the given `images.neopets.com` URI, but don't return its
+  # content. This can be faster in cases where the file's content isn't
+  # relevant to us, and we just want to ensure it exists.
+  def self.preload_file(uri)
+    load_file(uri, return_content: false)
+  end
+
+  # Load the file from the given `images.neopets.com` URI, directly from the
+  # source, without checking the local filesystem.
+  def self.load_file_from_origin(uri)
+    unless Addressable::URI.parse(uri).origin == "https://images.neopets.com"
+      raise ArgumentError, "NeopetsMediaArchive can only load from " +
+        "https://images.neopets.com, but got #{uri}"
+    end
+
+    response = get(uri)
+    if response.code == 404
+      raise NotFound, "origin server returned 404: #{uri}"
+    elsif response.code != 200
+      raise "expected status 200 but got #{response.code} (#{uri})"
+    end
+    response
+  end
+
+  # Build the archive-relative path for the given URI: the host, then the
+  # path, then the query string (if there is one).
+  def self.path_within_archive(uri)
+    uri = Addressable::URI.parse(uri)
+    path = uri.host + uri.path
+
+    # We include the query string as part of the file path, which is a bit odd!
+    # But Neopets often uses this for cache-busting, so we do need a mechanism
+    # for knowing whether we're holding the right version of the file. We could
+    # also consider storing the file by just its normal path, but with some
+    # metadata to track versioning information (e.g. a sqlite db, or a metadata
+    # file in the same directory).
+    path += "?" + uri.query if !uri.query.nil? && !uri.query.empty?
+
+    path
+  end
+
+  # Build the local filesystem path for the given URI, as a `Pathname`. We
+  # wrap the configured root in `Pathname()` so a plain String root works too.
+  def self.local_file_path(uri)
+    root = Pathname(Rails.configuration.neopets_media_archive_root)
+    root + path_within_archive(uri)
+  end
+
+  class NotFound < StandardError; end
+
+  # NOTE: A bare `private` doesn't apply to `def self.` methods, so we mark
+  # these logging helpers private explicitly with `private_class_method`.
+
+  private_class_method def self.info(message)
+    Rails.logger.info "[NeopetsMediaArchive] #{message}"
+  end
+
+  private_class_method def self.debug(message)
+    Rails.logger.debug "[NeopetsMediaArchive] #{message}"
+  end
+end
diff --git a/config/environments/development.rb b/config/environments/development.rb
index d57dad98..6eac4c70 100644
--- a/config/environments/development.rb
+++ b/config/environments/development.rb
@@ -107,4 +107,9 @@ Rails.application.configure do
   # override this with the IMPRESS_2020_ORIGIN environment variable!)
   config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
     "http://localhost:4000")
+
+  # Save the Neopets Media Archive in the local `tmp` folder. (In production,
+  # we keep this in a long-term location instead!)
+  config.neopets_media_archive_root = Rails.root / "tmp" /
+    "neopets_media_archive" / "development"
 end
diff --git a/config/environments/production.rb b/config/environments/production.rb
index 37c5cfb3..20858d10 100644
--- a/config/environments/production.rb
+++ b/config/environments/production.rb
@@ -126,4 +126,8 @@ Rails.application.configure do
   # IMPRESS_2020_ORIGIN environment variable!)
   config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
     "https://impress-2020.openneo.net")
+
+  # Save the Neopets Media Archive in `/var/lib/neopets-media-archive`, a
+  # long-term storage location.
+  config.neopets_media_archive_root = Pathname.new("/var/lib/neopets-media-archive")
 end
diff --git a/config/environments/test.rb b/config/environments/test.rb
index d7dee4e8..2f300377 100644
--- a/config/environments/test.rb
+++ b/config/environments/test.rb
@@ -66,4 +66,9 @@ Rails.application.configure do
   # override this with the IMPRESS_2020_ORIGIN environment variable!)
   config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
     "http://localhost:4000")
+
+  # Save the Neopets Media Archive in the local `tmp` folder. (In production,
+  # we keep this in a long-term location instead!)
+  config.neopets_media_archive_root = Rails.root / "tmp" /
+    "neopets_media_archive" / "test"
 end
diff --git a/deploy/setup.yml b/deploy/setup.yml
index 318a4f9f..4a573042 100644
--- a/deploy/setup.yml
+++ b/deploy/setup.yml
@@ -402,6 +402,14 @@
         password: "{{ mysql_user_password_2020 }}"
         priv: "openneo_impress.*:ALL,openneo_id.*:ALL"
 
+    - name: Create the Neopets Media Archive data directory
+      file:
+        path: /var/lib/neopets-media-archive
+        owner: impress
+        group: impress
+        mode: "755"
+        state: directory
+
   handlers:
     - name: Reload nginx
       systemd: