Move manifest backfill to swf_assets:manifests task

Okay, I've simplified the migration to *just* add the column, and
instead added a task to find assets without manifest URLs and backfill
them.

Performance is a lot better now, using the `async-http` library, which
as I understand it supports both persistent connections when invoked
like this, and maybe also HTTP/2 multiplexing?? (Though I'm not
actually sure images.neopets.com does lol)

I'm not sure about the number of concurrent tasks I picked here, 100
seems okay for an internet thing and for such small requests, but I
worry that the CDN is gonna get annoyed or something. Well, we'll see!
This task is very resumable if it turns out we get frozen out or
something.
This commit is contained in:
Emi Matchu 2023-11-10 16:52:50 -08:00
parent d35b10c1b8
commit dc22a458bf
19 changed files with 134 additions and 50 deletions

View file

@ -58,6 +58,11 @@ gem 'parallel', '~> 1.23'
gem "httparty", "~> 0.21.0"
gem "addressable", "~> 2.8"
# For advanced batching of many HTTP requests.
gem "async", "~> 2.6", require: false
gem "async-http", "~> 0.61.0", require: false
gem "thread-local", "~> 1.1", require: false
# For debugging.
gem 'web-console', '~> 4.2', group: :development

View file

@ -82,6 +82,23 @@ GEM
tzinfo (~> 2.0)
addressable (2.8.4)
public_suffix (>= 2.0.2, < 6.0)
async (2.6.5)
console (~> 1.10)
fiber-annotation
io-event (~> 1.1)
timers (~> 4.1)
async-http (0.61.0)
async (>= 1.25)
async-io (>= 1.28)
async-pool (>= 0.2)
protocol-http (~> 0.25.0)
protocol-http1 (~> 0.16.0)
protocol-http2 (~> 0.15.0)
traces (>= 0.10.0)
async-io (1.37.0)
async
async-pool (0.4.0)
async (>= 1.25)
babel-source (5.8.35)
babel-transpiler (0.7.0)
babel-source (>= 4.0, < 6)
@ -95,6 +112,9 @@ GEM
builder (3.2.4)
concurrent-ruby (1.2.2)
connection_pool (2.2.5)
console (1.23.2)
fiber-annotation
fiber-local
crass (1.0.6)
date (3.3.3)
devise (4.9.3)
@ -116,6 +136,8 @@ GEM
erubi (1.12.0)
execjs (2.5.2)
ffi (1.15.5)
fiber-annotation (0.2.0)
fiber-local (1.0.0)
globalid (1.2.1)
activesupport (>= 6.1)
globalize (6.3.0)
@ -136,6 +158,7 @@ GEM
i18n (1.14.1)
concurrent-ruby (~> 1.0)
io-console (0.6.0)
io-event (1.3.3)
irb (1.8.3)
rdoc
reline (>= 0.3.8)
@ -181,6 +204,13 @@ GEM
racc (~> 1.4)
orm_adapter (0.5.0)
parallel (1.23.0)
protocol-hpack (1.4.2)
protocol-http (0.25.0)
protocol-http1 (0.16.0)
protocol-http (~> 0.22)
protocol-http2 (0.15.1)
protocol-hpack (~> 1.4)
protocol-http (~> 0.18)
psych (5.1.1.1)
stringio
public_suffix (5.0.3)
@ -287,8 +317,11 @@ GEM
terser (1.1.17)
execjs (>= 0.3.0, < 3)
thor (1.2.2)
thread-local (1.1.0)
tilt (2.2.0)
timeout (0.4.0)
timers (4.3.5)
traces (0.11.1)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unf (0.1.4)
@ -314,6 +347,8 @@ PLATFORMS
DEPENDENCIES
RocketAMF!
addressable (~> 2.8)
async (~> 2.6)
async-http (~> 0.61.0)
bootsnap (~> 1.16)
devise (~> 4.9, >= 4.9.2)
devise-encryptable (~> 0.2.0)
@ -344,6 +379,7 @@ DEPENDENCIES
sprockets (~> 4.2)
stackprof (~> 0.2.25)
terser (~> 1.1, >= 1.1.17)
thread-local (~> 1.1)
web-console (~> 4.2)
will_paginate (~> 4.0)

View file

@ -1,55 +1,5 @@
class AddManifestUrlToSwfAssets < ActiveRecord::Migration[7.1]
def change
add_column :swf_assets, :manifest_url, :string
# Okay, this is a big one to run upward! We're going to infer the manifest
# for as many assets as we can!
reversible do |direction|
direction.up do
Net::HTTP.start("images.neopets.com", 443, use_ssl: true) do |http|
SwfAsset.find_each do |swf_asset|
begin
manifest_url = infer_manifest_url(http, swf_asset.url)
rescue StandardError => error
Rails.logger.warn "Could not infer manifest URL for #{swf_asset.id}, skipping: #{error.message}"
next
end
Rails.logger.info "#{swf_asset.id}: #{manifest_url}"
swf_asset.manifest_url = manifest_url
swf_asset.save!
end
end
end
end
end
SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
def infer_manifest_url(http, swf_url)
url_match = swf_url.match(SWF_URL_PATTERN)
raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?
# Build the potential manifest URLs, from the two structures we know of.
type, folders, hash_str = url_match.captures
potential_manifest_urls = [
"https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json",
"https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
]
# Send a HEAD request to test each manifest URL, without downloading its
# content. If it succeeds, we're done!
potential_manifest_urls.each do |potential_manifest_url|
res = http.head potential_manifest_url
if res.is_a? Net::HTTPOK
return potential_manifest_url
elsif res.is_a? Net::HTTPNotFound
next
else
raise "unexpected manifest response code: #{res.code}"
end
end
# Otherwise, there's no valid manifest URL.
raise "none of the common manifest URL patterns returned HTTP 200"
end
end

93
lib/tasks/swf_assets.rake Normal file
View file

@ -0,0 +1,93 @@
require 'async/barrier'
require 'async/http/internet/instance'
namespace :swf_assets do
desc "Backfill manifest_url for SwfAsset models"
task manifests: [:environment] do
assets = SwfAsset.where(manifest_url: nil)
count = assets.count
puts "Found #{count} assets without manifests"
Async do
# Share a pool of persistent connections, rather than reconnecting on
# each request. (This library does that automatically!)
internet = Async::HTTP::Internet.instance
# Load the assets in batches, then process each batch in two steps: first
# inferring all manifest URLs in the batch, then saving all assets in the
# batch. (This makes the update step more efficient, and it also avoids
# simultaneous queries across the fibers, which ActiveRecord disallows!)
#
# We keep track of a shared index `i` here, but we only actually
# increment it once each task is *done*, so that the numbers output in
# the right order!
i = 0
assets.find_in_batches(batch_size: 1000) do |batch|
# Create a barrier, to let us wait on all the tasks; then under it
# create a semaphore, to limit how many tasks run at once.
barrier = Async::Barrier.new
semaphore = Async::Semaphore.new(100, parent: barrier)
batch.each do |asset|
semaphore.async do |task|
manifest_url = nil
begin
task.with_timeout(5) do
manifest_url = infer_manifest_url(asset.url, internet)
end
rescue StandardError => error
i += 1
puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}"
next
end
i += 1
puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}"
# Write, but don't yet save, the manifest URL.
asset.manifest_url = manifest_url
end
end
# Wait for all the above tasks to finish. (Then, all of the assets that
# succeeded should have an unsaved `manifest_url` change.)
barrier.wait
# Save all of the assets in the batch. (We do this in a transaction not
# for the transactional semantics, but because it's notably faster than
# doing a commit between each query, which is what sending the queries
# individually would effectively do!)
SwfAsset.transaction { batch.each(&:save!) }
end
end
end
end
SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
def infer_manifest_url(swf_url, internet)
url_match = swf_url.match(SWF_URL_PATTERN)
raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?
# Build the potential manifest URLs, from the two structures we know of.
type, folders, hash_str = url_match.captures
potential_manifest_urls = [
"https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json",
"https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
]
# Send a HEAD request to test each manifest URL, without downloading its
# content. If it succeeds, we're done!
potential_manifest_urls.each do |potential_manifest_url|
res = internet.head potential_manifest_url
if res.ok?
return potential_manifest_url
elsif res.status == 404
next # Ok, this was not the manifest!
else
raise "unexpected manifest response code: #{res.status}"
end
end
# Otherwise, there's no valid manifest URL.
raise "none of the common manifest URL patterns returned HTTP 200"
end

BIN
vendor/cache/async-2.6.5.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/async-http-0.61.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/async-io-1.37.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/async-pool-0.4.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/console-1.23.2.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/fiber-annotation-0.2.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/fiber-local-1.0.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/io-event-1.3.3.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/protocol-hpack-1.4.2.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/protocol-http-0.25.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/protocol-http1-0.16.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/protocol-http2-0.15.1.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/thread-local-1.1.0.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/timers-4.3.5.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/traces-0.11.1.gem vendored Normal file

Binary file not shown.