Compare commits: 3f449310d6 ... 992954ce89

4 commits

| SHA1 |
| --- |
| 992954ce89 |
| 9a3b33ea2f |
| f6cece9a59 |
| 2cc46703b9 |

8 changed files with 321 additions and 103 deletions
@@ -8,6 +8,10 @@ class AltStylesController < ApplicationController
      @alt_styles = @alt_styles.merge(@species.alt_styles)
    end

    # We're going to link to the HTML5 image URL, so make sure we have all the
    # manifests ready!
    SwfAsset.preload_manifests @alt_styles.map(&:swf_assets).flatten

    respond_to do |format|
      format.html { render }
      format.json {
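The new `preload_manifests` call warms the archive before the view reads each asset's manifest one by one. A sketch of the access pattern it speeds up (a hypothetical view loop; the actual template isn't part of this diff):

# Each html5_image_url call reads the asset's manifest via
# NeopetsMediaArchive; preloading fetches the manifests 10 at a time up
# front, instead of one network round-trip at a time during render.
@alt_styles.each do |alt_style|
  alt_style.swf_assets.each { |swf_asset| swf_asset.html5_image_url }
end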
@@ -1,3 +1,6 @@
require 'async'
require 'async/barrier'
require 'async/semaphore'
require 'fileutils'
require 'uri'
@@ -113,31 +116,39 @@ class SwfAsset < ApplicationRecord
    }
  end

  MANIFEST_PATTERN = %r{^https://images.neopets.com/(?<prefix>.+)/(?<id>[0-9]+)(?<hash_part>_[^/]+)?/manifest\.json}
  def manifest
    raise "manifest_url is blank" if manifest_url.blank?
    NeopetsMediaArchive.load_json(manifest_url)
  end

  def preload_manifest
    raise "manifest_url is blank" if manifest_url.blank?
    NeopetsMediaArchive.preload_file(manifest_url)
  end

  MANIFEST_BASE_URL = Addressable::URI.parse("https://images.neopets.com")
  def manifest_asset_urls
    return {} if manifest_url.nil?

    begin
      # Organize the asset URLs by file extension, grab the ones we want, and
      # convert them from paths to full URLs.
      manifest["cpmanifest"]["assets"][0]["asset_data"].
        to_h { |a| [a["file_ext"].to_sym, a] }.
        slice(:png, :svg, :js).
        transform_values { |a| (MANIFEST_BASE_URL + a["url"]).to_s }
    rescue StandardError => error
      Rails.logger.error "Could not read URLs from manifest: #{error.full_message}"
      return {}
    end
  end

  def html5_image_url
    return nil if manifest_url.nil?

    # HACK: Just assuming all of these were well-formed by the same process,
    # and infer the image URL from the manifest URL! But strictly speaking we
    # should be reading the manifest to check!
    match = manifest_url.match(MANIFEST_PATTERN)
    return nil if match.nil?

    "https://images.neopets.com/#{match[:prefix]}/" +
      "#{match[:id]}#{match[:hash_part]}/#{match[:id]}.png"
    manifest_asset_urls[:png]
  end

  def html5_svg_url
    return nil if manifest_url.nil?

    # HACK: Just assuming all of these were well-formed by the same process,
    # and infer the image URL from the manifest URL! But strictly speaking we
    # should be reading the manifest to check!
    match = manifest_url.match(MANIFEST_PATTERN)
    return nil if match.nil?

    "https://images.neopets.com/#{match[:prefix]}/" +
      "#{match[:id]}#{match[:hash_part]}/#{match[:id]}.svg"
    manifest_asset_urls[:svg]
  end

  def known_glitches
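For reference, `manifest_asset_urls` keys the manifest's `asset_data` list by file extension and resolves each path against images.neopets.com, so its return value is a hash along these lines (a hypothetical manifest's worth of URLs):

# Hypothetical return value of manifest_asset_urls:
{
  png: "https://images.neopets.com/cp/items/data/000/000/123/123_abc123/123.png",
  svg: "https://images.neopets.com/cp/items/data/000/000/123/123_abc123/123.svg",
  js: "https://images.neopets.com/cp/items/data/000/000/123/123_abc123/all-item_foo.js",
}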
@@ -227,6 +238,38 @@ class SwfAsset < ApplicationRecord
    ))
  end

  # Given a list of SWF assets, ensure all of their manifests are loaded, with
  # fast concurrent execution!
  def self.preload_manifests(swf_assets)
    # Blocks all tasks beneath it.
    barrier = Async::Barrier.new

    Sync do
      # Only allow 10 manifests to be loaded at a time.
      semaphore = Async::Semaphore.new(10, parent: barrier)

      # Load all the manifests in async tasks. This will load them 10 at a time
      # rather than all at once (because of the semaphore), and the
      # NeopetsMediaArchive will share a pool of persistent connections for
      # them.
      swf_assets.map do |swf_asset|
        semaphore.async do
          begin
            swf_asset.preload_manifest
          rescue StandardError => error
            Rails.logger.error "Could not preload manifest for asset " +
              "#{swf_asset.id} (#{swf_asset.manifest_url}): #{error.message}"
          end
        end
      end

      # Wait until all tasks are done.
      barrier.wait
    ensure
      barrier.stop # If something goes wrong, clean up all tasks.
    end
  end

  before_save do
    # If an asset body ID changes, that means more than one body ID has been
    # linked to it, meaning that it's probably wearable by all bodies.
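The barrier-plus-semaphore combination above is the socketry/async idiom for "run at most N tasks at once, then wait for them all". A minimal standalone sketch of the same pattern (the task bodies here are placeholders):

require 'async'
require 'async/barrier'
require 'async/semaphore'

Sync do
  barrier = Async::Barrier.new
  semaphore = Async::Semaphore.new(10, parent: barrier) # cap concurrency at 10

  20.times do |i|
    semaphore.async { puts "task #{i}" } # only 10 of these run at a time
  end

  barrier.wait # block until every task under the barrier has finished
ensure
  barrier.stop # on error, cancel whatever is still running
end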
app/services/neopets_media_archive.rb (new file, 120 lines)

@@ -0,0 +1,120 @@
require "addressable/uri"
require "async/http/internet/instance"
require "json"

# The Neopets Media Archive is a service that mirrors images.neopets.com files
# locally. You can request a file from it, and we'll serve it from disk if we
# have it, or request and save it if not.
#
# This is a bit different than a cache, because the intention is not just
# optimization but that we *want* to be saving images.neopets.com as a
# long-term archive, not dependent on their services having 100% uptime in
# order for us to operate. We never discard old files, we just keep going!
module NeopetsMediaArchive
  # Share a pool of persistent connections, rather than reconnecting on
  # each request. (This library does that automatically!)
  INTERNET = Async::HTTP::Internet.instance

  ROOT_PATH = Pathname.new(Rails.configuration.neopets_media_archive_root)

  # Load the file from the given `images.neopets.com` URI, as JSON.
  def self.load_json(uri)
    JSON.parse(load_file(uri))
  end

  # Load the file from the given `images.neopets.com` URI.
  def self.load_file(uri, return_content: true)
    local_path = local_file_path(uri)

    # Read the file locally if we have it.
    if return_content
      begin
        content = File.read(local_path)
        debug "Loaded source file from filesystem: #{local_path}"
        return content
      rescue Errno::ENOENT
        # If it doesn't exist, that's fine: just move on and download it.
      end
    else
      # When we don't need the content, "loading" the file is just ensuring
      # it exists. If it doesn't, we'll move on and load it from source.
      # (We use this when preloading files, to save the cost of reading files
      # we're not ready to use yet.)
      if File.exist?(local_path)
        debug "Source file is already loaded, skipping: #{local_path}"
        return
      end
    end

    # Download the file from the origin, then save a copy for next time.
    content = load_file_from_origin(uri)
    info "Loaded source file from origin: #{uri}"
    local_path.dirname.mkpath
    File.write(local_path, content)
    info "Wrote source file to filesystem: #{local_path}"

    return_content ? content : nil
  end

  # Load the file from the given `images.neopets.com` URI, but don't return its
  # content. This can be faster in cases where the file's content isn't
  # relevant to us, and we just want to ensure it exists.
  def self.preload_file(uri)
    load_file(uri, return_content: false)
  end

  # Load the file from the given `images.neopets.com` URI, directly from the
  # source, without checking the local filesystem.
  def self.load_file_from_origin(uri)
    unless Addressable::URI.parse(uri).origin == "https://images.neopets.com"
      raise ArgumentError, "NeopetsMediaArchive can only load from " +
        "https://images.neopets.com, but got #{uri}"
    end

    # By running this request in a `Sync` block, we make this method look
    # synchronous to the caller—but if run in the context of an async task, it
    # will pause execution and move onto other work until the request is done.
    # We use this in the `swf_assets:manifests:load` task to perform many
    # requests in parallel!
    Sync do
      response = INTERNET.get(uri)
      if response.status == 404
        raise NotFound, "origin server returned 404: #{uri}"
      elsif response.status != 200
        raise "expected status 200 but got #{response.status} (#{uri})"
      end
      response.body.read
    end
  end

  def self.path_within_archive(uri)
    uri = Addressable::URI.parse(uri)
    path = uri.host + uri.path

    # We include the query string as part of the file path, which is a bit odd!
    # But Neopets often uses this for cache-busting, so we do need a mechanism
    # for knowing whether we're holding the right version of the file. We could
    # also consider storing the file by just its normal path, but with some
    # metadata to track versioning information (e.g. a sqlite db, or a metadata
    # file in the same directory).
    path += "?" + uri.query if !uri.query.nil? && !uri.query.empty?

    path
  end

  def self.local_file_path(uri)
    ROOT_PATH + path_within_archive(uri)
  end

  class NotFound < StandardError; end

  private

  def self.info(message)
    Rails.logger.info "[NeopetsMediaArchive] #{message}"
  end

  def self.debug(message)
    Rails.logger.debug "[NeopetsMediaArchive] #{message}"
  end
end
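A quick usage sketch of the module (the manifest URL here is hypothetical):

manifest_url = "https://images.neopets.com/cp/items/data/000/000/123/123_abc123/manifest.json"

# Ensure the file is on disk without reading it (what preloading uses):
NeopetsMediaArchive.preload_file(manifest_url)

# Parse it as JSON: served from disk if present, downloaded and saved if not.
manifest = NeopetsMediaArchive.load_json(manifest_url)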
@@ -107,4 +107,9 @@ Rails.application.configure do
  # override this with the IMPRESS_2020_ORIGIN environment variable!)
  config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
    "http://localhost:4000")

  # Save the Neopets Media Archive in the local `tmp` folder. (In production,
  # we keep this in a long-term location instead!)
  config.neopets_media_archive_root = Rails.root / "tmp" /
    "neopets_media_archive" / "development"
end
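Combined with `path_within_archive` above (host + path, plus any query string), a hypothetical manifest URL would be stored in development at:

# Hypothetical: local path for
# https://images.neopets.com/cp/items/data/000/123/manifest.json
Rails.root / "tmp" / "neopets_media_archive" / "development" /
  "images.neopets.com/cp/items/data/000/123/manifest.json"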
@@ -126,4 +126,8 @@ Rails.application.configure do
  # IMPRESS_2020_ORIGIN environment variable!)
  config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
    "https://impress-2020.openneo.net")

  # Save the Neopets Media Archive in `/var/lib/neopets-media-archive`, a
  # long-term storage location.
  config.neopets_media_archive_root = "/var/lib/neopets-media-archive"
end
@@ -66,4 +66,9 @@ Rails.application.configure do
  # override this with the IMPRESS_2020_ORIGIN environment variable!)
  config.impress_2020_origin = ENV.fetch("IMPRESS_2020_ORIGIN",
    "http://localhost:4000")

  # Save the Neopets Media Archive in the local `tmp` folder. (In production,
  # we keep this in a long-term location instead!)
  config.neopets_media_archive_root = Rails.root / "tmp" /
    "neopets_media_archive" / "test"
end
@@ -402,6 +402,14 @@
      password: "{{ mysql_user_password_2020 }}"
      priv: "openneo_impress.*:ALL,openneo_id.*:ALL"

    - name: Create the Neopets Media Archive data directory
      file:
        path: /var/lib/neopets-media-archive
        owner: impress
        group: impress
        mode: "755"
        state: directory

  handlers:
    - name: Reload nginx
      systemd:
@@ -35,75 +35,104 @@ namespace :swf_assets do
    end
  end

  desc "Backfill manifest_url for SwfAsset models"
  task manifests: [:environment] do
    timeout = ENV.fetch("TIMEOUT", "5").to_i
  namespace :manifests do
    desc "Save all known manifests to the Neopets Media Archive"
    task load: [:environment] do
      # Log errors to STDOUT, but we don't need the info messages about
      # successful saves.
      Rails.logger = Logger.new(STDOUT, level: :error)

    assets = SwfAsset.where(manifest_url: nil)
    count = assets.count
    puts "Found #{count} assets without manifests"
      # Find all the manifests with known URLs. (We don't have a database
      # filter for "do we already have the manifest downloaded", but that's
      # okay, the preload method will quickly check that for us!)
      swf_assets = SwfAsset.where.not(manifest_url: nil)
      total_count = swf_assets.count
      puts "Found #{total_count} assets with manifests"

    Async do
      # Share a pool of persistent connections, rather than reconnecting on
      # each request. (This library does that automatically!)
      internet = Async::HTTP::Internet.instance

      # Load the assets in batches, then process each batch in two steps: first
      # inferring all manifest URLs in the batch, then saving all assets in the
      # batch. (This makes the update step more efficient, and it also avoids
      # simultaneous queries across the fibers, which ActiveRecord disallows!)
      #
      # We keep track of a shared index `i` here, but we only actually
      # increment it once each task is *done*, so that the numbers output in
      # the right order!
      i = 0
      assets.find_in_batches(batch_size: 1000) do |batch|
        # Create a barrier, to let us wait on all the tasks; then under it
        # create a semaphore, to limit how many tasks run at once.
        barrier = Async::Barrier.new
        semaphore = Async::Semaphore.new(100, parent: barrier)

        batch.each do |asset|
          semaphore.async do |task|
            manifest_url = nil
            begin
              task.with_timeout(timeout) do
                manifest_url = infer_manifest_url(asset.url, internet)
              end
            rescue StandardError => error
              i += 1
              puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}"
              next
            end

            i += 1
            puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}"

            # Write, but don't yet save, the manifest URL.
            asset.manifest_url = manifest_url
          end
      # For each batch of 1000 assets, load their manifests concurrently.
      # Wrap everything in a top-level sync, so keyboard interrupts will
      # propagate all the way up to here, instead of just cancelling the
      # current batch.
      Sync do
        saved_count = 0
        swf_assets.find_in_batches(batch_size: 1000) do |swf_assets|
          SwfAsset.preload_manifests(swf_assets)
          saved_count += swf_assets.size
          puts "Loaded #{saved_count} of #{total_count} manifests"
        end
      end
    end

        # Wait for all the above tasks to finish. (Then, all of the assets that
        # succeeded should have an unsaved `manifest_url` change.)
        barrier.wait
    desc "Backfill manifest_url for SwfAsset models"
    task urls: [:environment] do
      timeout = ENV.fetch("TIMEOUT", "5").to_i

        # Save all of the assets in the batch. (We do this in a transaction not
        # for the transactional semantics, but because it's notably faster than
        # doing a commit between each query, which is what sending the queries
        # individually would effectively do!)
        begin
          SwfAsset.transaction do
            batch.each do |asset|
      assets = SwfAsset.where(manifest_url: nil)
      count = assets.count
      puts "Found #{count} assets without manifests"

      Sync do
        # Share a pool of persistent connections, rather than reconnecting on
        # each request. (This library does that automatically!)
        internet = Async::HTTP::Internet.instance

        # Load the assets in batches, then process each batch in two steps: first
        # inferring all manifest URLs in the batch, then saving all assets in the
        # batch. (This makes the update step more efficient, and it also avoids
        # simultaneous queries across the fibers, which ActiveRecord disallows!)
        #
        # We keep track of a shared index `i` here, but we only actually
        # increment it once each task is *done*, so that the numbers output in
        # the right order!
        i = 0
        assets.find_in_batches(batch_size: 1000) do |batch|
          # Create a barrier, to let us wait on all the tasks; then under it
          # create a semaphore, to limit how many tasks run at once.
          barrier = Async::Barrier.new
          semaphore = Async::Semaphore.new(100, parent: barrier)

          batch.each do |asset|
            semaphore.async do |task|
              manifest_url = nil
              begin
                asset.save!
                task.with_timeout(timeout) do
                  manifest_url = infer_manifest_url(asset.url, internet)
                end
              rescue StandardError => error
                puts "⚠️ Saving asset #{asset.id} failed: #{error.full_message}"
                i += 1
                puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}"
                next
              end

              i += 1
              puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}"

              # Write, but don't yet save, the manifest URL.
              asset.manifest_url = manifest_url
            end
          end
        rescue StandardError => error
          puts "⚠️ Saving this batch failed: #{error.full_message}"

          # Wait for all the above tasks to finish. (Then, all of the assets that
          # succeeded should have an unsaved `manifest_url` change.)
          barrier.wait

          # Save all of the assets in the batch. (We do this in a transaction not
          # for the transactional semantics, but because it's notably faster than
          # doing a commit between each query, which is what sending the queries
          # individually would effectively do!)
          begin
            SwfAsset.transaction do
              batch.each do |asset|
                begin
                  asset.save!
                rescue StandardError => error
                  puts "⚠️ Saving asset #{asset.id} failed: #{error.full_message}"
                end
              end
            end
          rescue StandardError => error
            puts "⚠️ Saving this batch failed: #{error.full_message}"
          end
        end
      end
    end
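With this split, the URL backfill and the archive download run as separate tasks, e.g.:

bin/rails swf_assets:manifests:urls   # infer and save manifest_url columns (TIMEOUT seconds per request, default 5)
bin/rails swf_assets:manifests:load   # download all known manifests into the archive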
@@ -112,29 +141,29 @@ end

SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
def infer_manifest_url(swf_url, internet)
  url_match = swf_url.match(SWF_URL_PATTERN)
  raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?

  # Build the potential manifest URLs, from the two structures we know of.
  type, folders, hash_str = url_match.captures
  potential_manifest_urls = [
    "https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json",
    "https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
  ]

  # Send a HEAD request to test each manifest URL, without downloading its
  # content. If it succeeds, we're done!
  potential_manifest_urls.each do |potential_manifest_url|
    res = internet.head potential_manifest_url
    if res.ok?
      return potential_manifest_url
    elsif res.status == 404
      next # Ok, this was not the manifest!
    else
      raise "unexpected manifest response code: #{res.status}"
    end
  end

  # Otherwise, there's no valid manifest URL.
  raise "all of the common manifest URL patterns returned HTTP 404"
end
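To illustrate the inference with a made-up SWF URL: given //images.neopets.com/cp/items/swf/000/000/123/123_abc123.swf, the captures are type "items", folders "000/000/123/123", and hash "abc123", so the two candidates HEAD-checked are:

# Hypothetical candidates for the SWF URL above:
"https://images.neopets.com/cp/items/data/000/000/123/123/manifest.json"
"https://images.neopets.com/cp/items/data/000/000/123/123_abc123/manifest.json"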