Compare commits


No commits in common. "992954ce89983fa96e35aff1da872d3c8406ea28" and "3f449310d667d7242482cc91320a3192fcd576df" have entirely different histories.

8 changed files with 100 additions and 318 deletions

View file

@@ -8,10 +8,6 @@ class AltStylesController < ApplicationController
@alt_styles = @alt_styles.merge(@species.alt_styles)
end
# We're going to link to the HTML5 image URL, so make sure we have all the
# manifests ready!
SwfAsset.preload_manifests @alt_styles.map(&:swf_assets).flatten
respond_to do |format|
format.html { render }
format.json {
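For context, a minimal usage sketch of the preload step above (hypothetical variable names, not part of the diff), assuming the SwfAsset.preload_manifests class method that appears later in this compare:

assets = @alt_styles.map(&:swf_assets).flatten
SwfAsset.preload_manifests(assets)         # fetch/cache the manifests, up to 10 at a time
image_urls = assets.map(&:html5_image_url) # in the manifest-reading version, these lookups can now hit local disk instead of the network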

View file

@@ -1,6 +1,3 @@
require 'async'
require 'async/barrier'
require 'async/semaphore'
require 'fileutils'
require 'uri'
@@ -116,39 +113,31 @@ class SwfAsset < ApplicationRecord
}
end
def manifest
raise "manifest_url is blank" if manifest_url.blank?
NeopetsMediaArchive.load_json(manifest_url)
end
def preload_manifest
raise "manifest_url is blank" if manifest_url.blank?
NeopetsMediaArchive.preload_file(manifest_url)
end
MANIFEST_BASE_URL = Addressable::URI.parse("https://images.neopets.com")
def manifest_asset_urls
return {} if manifest_url.nil?
begin
# Organize the asset URLs by file extension, grab the ones we want, and
# convert them from paths to full URLs.
manifest["cpmanifest"]["assets"][0]["asset_data"].
to_h { |a| [a["file_ext"].to_sym, a] }.
slice(:png, :svg, :js)
.transform_values { |a| (MANIFEST_BASE_URL + a["url"]).to_s }
rescue StandardError => error
Rails.logger.error "Could not read URLs from manifest: #{error.full_message}"
return {}
end
end
MANIFEST_PATTERN = %r{^https://images.neopets.com/(?<prefix>.+)/(?<id>[0-9]+)(?<hash_part>_[^/]+)?/manifest\.json}
def html5_image_url
manifest_asset_urls[:png]
return nil if manifest_url.nil?
# HACK: Just assuming all of these were well-formed by the same process,
# and infer the image URL from the manifest URL! But strictly speaking we
# should be reading the manifest to check!
match = manifest_url.match(MANIFEST_PATTERN)
return nil if match.nil?
"https://images.neopets.com/#{match[:prefix]}/" +
"#{match[:id]}#{match[:hash_part]}/#{match[:id]}.png"
end
def html5_svg_url
manifest_asset_urls[:svg]
return nil if manifest_url.nil?
# HACK: Just assuming all of these were well-formed by the same process,
# and infer the image URL from the manifest URL! But strictly speaking we
# should be reading the manifest to check!
match = manifest_url.match(MANIFEST_PATTERN)
return nil if match.nil?
"https://images.neopets.com/#{match[:prefix]}/" +
"#{match[:id]}#{match[:hash_part]}/#{match[:id]}.svg"
end
def known_glitches
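To make the two strategies above concrete, here is a hedged worked example (the manifest URL, IDs, and hash are invented for illustration):

manifest_url = "https://images.neopets.com/cp/items/data/000/000/012/12345_abc123def/manifest.json"
match = manifest_url.match(SwfAsset::MANIFEST_PATTERN)
match[:prefix]    # => "cp/items/data/000/000/012"
match[:id]        # => "12345"
match[:hash_part] # => "_abc123def"
# Inferred URLs: .../12345_abc123def/12345.png and .../12345_abc123def/12345.svg

# The manifest-reading path, by contrast, returns a hash shaped roughly like:
# manifest_asset_urls
# # => { png: "https://images.neopets.com/.../12345.png",
# #      svg: "https://images.neopets.com/.../12345.svg",
# #      js:  "https://images.neopets.com/.../12345.js" }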
@@ -238,38 +227,6 @@ class SwfAsset < ApplicationRecord
))
end
# Given a list of SWF assets, ensure all of their manifests are loaded, with
# fast concurrent execution!
def self.preload_manifests(swf_assets)
# Blocks all tasks beneath it.
barrier = Async::Barrier.new
Sync do
# Only allow 10 manifests to be loaded at a time.
semaphore = Async::Semaphore.new(10, parent: barrier)
# Load all the manifests in async tasks. This will load them 10 at a time
# rather than all at once (because of the semaphore), and the
# NeopetsMediaArchive will share a pool of persistent connections for
# them.
swf_assets.map do |swf_asset|
semaphore.async do
begin
swf_asset.preload_manifest
rescue StandardError => error
Rails.logger.error "Could not preload manifest for asset " +
"#{swf_asset.id} (#{swf_asset.manifest_url}): #{error.message}"
end
end
end
# Wait until all tasks are done.
barrier.wait
ensure
barrier.stop # If something goes wrong, clean up all tasks.
end
end
before_save do
# If an asset body ID changes, that means more than one body ID has been
# linked to it, meaning that it's probably wearable by all bodies.
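The preload_manifests method above leans on the async gem's barrier/semaphore pattern. For readers unfamiliar with the library, here is a standalone sketch of the same pattern with toy data (nothing from this codebase):

require "async"
require "async/barrier"
require "async/semaphore"

barrier = Async::Barrier.new
Sync do
  # Allow at most 3 child tasks to run concurrently.
  semaphore = Async::Semaphore.new(3, parent: barrier)
  (1..20).each do |n|
    semaphore.async { puts "processing item #{n}" }
  end
  barrier.wait # block until every child task finishes
ensure
  barrier.stop # on error or interrupt, cancel whatever is still running
end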

View file

@@ -1,120 +0,0 @@
require "addressable/uri"
require "async/http/internet/instance"
require "json"
# The Neopets Media Archive is a service that mirrors images.neopets.com files
# locally. You can request a file from it, and we'll serve it from disk if we
# have it, or request and save it if not.
#
# This is a bit different than a cache, because the intention is not just
# optimization but that we *want* to be saving images.neopets.com as a
# long-term archive, not dependent on their services having 100% uptime in
# order for us to operate. We never discard old files, we just keep going!
module NeopetsMediaArchive
# Share a pool of persistent connections, rather than reconnecting on
# each request. (This library does that automatically!)
INTERNET = Async::HTTP::Internet.instance
ROOT_PATH = Pathname.new(Rails.configuration.neopets_media_archive_root)
# Load the file from the given `images.neopets.com` URI, as JSON.
def self.load_json(uri)
JSON.parse(load_file(uri))
end
# Load the file from the given `images.neopets.com` URI.
def self.load_file(uri, return_content: true)
local_path = local_file_path(uri)
# Read the file locally if we have it.
if return_content
begin
content = File.read(local_path)
debug "Loaded source file from filesystem: #{local_path}"
return content
rescue Errno::ENOENT
# If it doesn't exist, that's fine: just move on and download it.
end
else
# When we don't need the content, "loading" the file is just ensuring
# it exists. If it doesn't, we'll move on and load it from source.
# (We use this when preloading files, to save the cost of reading files
# we're not ready to use yet.)
if File.exist?(local_path)
debug "Source file is already loaded, skipping: #{local_path}"
return
end
end
# Download the file from the origin, then save a copy for next time.
content = load_file_from_origin(uri)
info "Loaded source file from origin: #{uri}"
local_path.dirname.mkpath
File.write(local_path, content)
info "Wrote source file to filesystem: #{local_path}"
return_content ? content : nil
end
# Load the file from the given `images.neopets.com` URI, but don't return its
# content. This can be faster in cases where the file's content isn't
# relevant to us, and we just want to ensure it exists.
def self.preload_file(uri)
load_file(uri, return_content: false)
end
# Load the file from the given `images.neopets.com` URI, directly from the
# source, without checking the local filesystem.
def self.load_file_from_origin(uri)
unless Addressable::URI.parse(uri).origin == "https://images.neopets.com"
raise ArgumentError, "NeopetsMediaArchive can only load from " +
"https://images.neopets.com, but got #{uri}"
end
# By running this request in a `Sync` block, we make this method look
# synchronous to the caller—but if run in the context of an async task, it
# will pause execution and move onto other work until the request is done.
# We use this in the `swf_assets:manifests:load` task to perform many
# requests in parallel!
Sync do
response = INTERNET.get(uri)
if response.status == 404
raise NotFound, "origin server returned 404: #{uri}"
elsif response.status != 200
raise "expected status 200 but got #{response.status} (#{uri})"
end
response.body.read
end
end
def self.path_within_archive(uri)
uri = Addressable::URI.parse(uri)
path = uri.host + uri.path
# We include the query string as part of the file path, which is a bit odd!
# But Neopets often uses this for cache-busting, so we do need a mechanism
# for knowing whether we're holding the right version of the file. We could
# also consider storing the file by just its normal path, but with some
# metadata to track versioning information (e.g. a sqlite db, or a metadata
# file in the same directory).
path += "?" + uri.query if !uri.query.nil? && !uri.query.empty?
path
end
def self.local_file_path(uri)
ROOT_PATH + path_within_archive(uri)
end
class NotFound < StandardError; end
private
def self.info(message)
Rails.logger.info "[NeopetsMediaArchive] #{message}"
end
def self.debug(message)
Rails.logger.debug "[NeopetsMediaArchive] #{message}"
end
end
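A short usage sketch of the module above (the URL is hypothetical), including how a query string ends up as part of the on-disk path:

url = "https://images.neopets.com/cp/items/data/000/000/012/12345/manifest.json?v=2"
NeopetsMediaArchive.preload_file(url)     # ensure the file exists on disk, without reading it
data = NeopetsMediaArchive.load_json(url) # parsed JSON, served from disk once archived
NeopetsMediaArchive.path_within_archive(url)
# => "images.neopets.com/cp/items/data/000/000/012/12345/manifest.json?v=2"
NeopetsMediaArchive.local_file_path(url)
# => ROOT_PATH joined with the path above (a Pathname under neopets_media_archive_root)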

View file

@@ -107,9 +107,4 @@ Rails.application.configure do
# override this with the IMPRESS_2020_ORIGIN environment variable!)
config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
"http://localhost:4000")
# Save the Neopets Media Archive in the local `tmp` folder. (In production,
# we keep this in a long-term location instead!)
config.neopets_media_archive_root = Rails.root / "tmp" /
"neopets_media_archive" / "development"
end
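For reference, this setting is what NeopetsMediaArchive uses as its ROOT_PATH; a quick check in a development console (illustrative output) might look like:

Rails.configuration.neopets_media_archive_root
# => #<Pathname:.../tmp/neopets_media_archive/development>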

View file

@@ -126,8 +126,4 @@ Rails.application.configure do
# IMPRESS_2020_ORIGIN environment variable!)
config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
"https://impress-2020.openneo.net")
# Save the Neopets Media Archive in `/var/lib/neopets-media-archive`, a
# long-term storage location.
config.neopets_media_archive_root = "/var/lib/neopets-media-archive"
end

View file

@@ -66,9 +66,4 @@ Rails.application.configure do
# override this with the IMPRESS_2020_ORIGIN environment variable!)
config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
"http://localhost:4000")
# Save the Neopets Media Archive in the local `tmp` folder. (In production,
# we keep this in a long-term location instead!)
config.neopets_media_archive_root = Rails.root / "tmp" /
"neopets_media_archive" / "test"
end

View file

@@ -402,14 +402,6 @@
password: "{{ mysql_user_password_2020 }}"
priv: "openneo_impress.*:ALL,openneo_id.*:ALL"
- name: Create the Neopets Media Archive data directory
file:
path: /var/lib/neopets-media-archive
owner: impress
group: impress
mode: "755"
state: directory
handlers:
- name: Reload nginx
systemd:

View file

@@ -35,43 +35,15 @@ namespace :swf_assets do
end
end
namespace :manifests do
desc "Save all known manifests to the Neopets Media Archive"
task load: [:environment] do
# Log errors to STDOUT, but we don't need the info messages about
# successful saves.
Rails.logger = Logger.new(STDOUT, level: :error)
# Find all the manifests with known URLs. (We don't have a database
# filter for "do we already have the manifest downloaded", but that's
# okay, the preload method will quickly check that for us!)
swf_assets = SwfAsset.where.not(manifest_url: nil)
total_count = swf_assets.count
puts "Found #{total_count} assets with manifests"
# For each batch of 1000 assets, load their manifests concurrently.
# Wrap everything in a top-level sync, so keyboard interrupts will
# propagate all the way up to here, instead of just cancelling the
# current batch.
Sync do
saved_count = 0
swf_assets.find_in_batches(batch_size: 1000) do |swf_assets|
SwfAsset.preload_manifests(swf_assets)
saved_count += swf_assets.size
puts "Loaded #{saved_count} of #{total_count} manifests"
end
end
end
desc "Backfill manifest_url for SwfAsset models"
task urls: [:environment] do
task manifests: [:environment] do
timeout = ENV.fetch("TIMEOUT", "5").to_i
assets = SwfAsset.where(manifest_url: nil)
count = assets.count
puts "Found #{count} assets without manifests"
Sync do
Async do
# Share a pool of persistent connections, rather than reconnecting on
# each request. (This library does that automatically!)
internet = Async::HTTP::Internet.instance
@@ -136,7 +108,6 @@ namespace :swf_assets do
end
end
end
end
end
SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
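For completeness, a hedged note on running the tasks above from the shell (written as Ruby comments; exact task names depend on which side of this compare is checked out):

# bin/rails swf_assets:manifests:load            # the namespaced bulk-preload task defined above
# TIMEOUT=10 bin/rails swf_assets:manifests:urls # the manifest_url backfill, with TIMEOUT overriding its default of 5 (seconds, presumably)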