Move manifest backfill to swf_assets:manifests
task
Okay, I've simplified the migration to *just* add the column, and instead added a task to find assets without manifest URLs and backfill them. Performance is a lot better now, using the `async-http` library, which as I understand it supports both persistent connections when invoked like this, and maybe also HTTP/2 multiplexing?? (Though I'm not actually sure images.neopets.com does lol) I'm not sure about the number of concurrent tasks I picked here, 100 seems okay for an internet thing and for such small requests, but I worry that the CDN is gonna get annoyed or something. Well, we'll see! This task is very resumable if it turns out we get frozen out or something.
This commit is contained in:
parent
d35b10c1b8
commit
dc22a458bf
19 changed files with 134 additions and 50 deletions
5
Gemfile
5
Gemfile
|
@ -58,6 +58,11 @@ gem 'parallel', '~> 1.23'
|
||||||
gem "httparty", "~> 0.21.0"
|
gem "httparty", "~> 0.21.0"
|
||||||
gem "addressable", "~> 2.8"
|
gem "addressable", "~> 2.8"
|
||||||
|
|
||||||
|
# For advanced batching of many HTTP requests.
|
||||||
|
gem "async", "~> 2.6", require: false
|
||||||
|
gem "async-http", "~> 0.61.0", require: false
|
||||||
|
gem "thread-local", "~> 1.1", require: false
|
||||||
|
|
||||||
# For debugging.
|
# For debugging.
|
||||||
gem 'web-console', '~> 4.2', group: :development
|
gem 'web-console', '~> 4.2', group: :development
|
||||||
|
|
||||||
|
|
36
Gemfile.lock
36
Gemfile.lock
|
@ -82,6 +82,23 @@ GEM
|
||||||
tzinfo (~> 2.0)
|
tzinfo (~> 2.0)
|
||||||
addressable (2.8.4)
|
addressable (2.8.4)
|
||||||
public_suffix (>= 2.0.2, < 6.0)
|
public_suffix (>= 2.0.2, < 6.0)
|
||||||
|
async (2.6.5)
|
||||||
|
console (~> 1.10)
|
||||||
|
fiber-annotation
|
||||||
|
io-event (~> 1.1)
|
||||||
|
timers (~> 4.1)
|
||||||
|
async-http (0.61.0)
|
||||||
|
async (>= 1.25)
|
||||||
|
async-io (>= 1.28)
|
||||||
|
async-pool (>= 0.2)
|
||||||
|
protocol-http (~> 0.25.0)
|
||||||
|
protocol-http1 (~> 0.16.0)
|
||||||
|
protocol-http2 (~> 0.15.0)
|
||||||
|
traces (>= 0.10.0)
|
||||||
|
async-io (1.37.0)
|
||||||
|
async
|
||||||
|
async-pool (0.4.0)
|
||||||
|
async (>= 1.25)
|
||||||
babel-source (5.8.35)
|
babel-source (5.8.35)
|
||||||
babel-transpiler (0.7.0)
|
babel-transpiler (0.7.0)
|
||||||
babel-source (>= 4.0, < 6)
|
babel-source (>= 4.0, < 6)
|
||||||
|
@ -95,6 +112,9 @@ GEM
|
||||||
builder (3.2.4)
|
builder (3.2.4)
|
||||||
concurrent-ruby (1.2.2)
|
concurrent-ruby (1.2.2)
|
||||||
connection_pool (2.2.5)
|
connection_pool (2.2.5)
|
||||||
|
console (1.23.2)
|
||||||
|
fiber-annotation
|
||||||
|
fiber-local
|
||||||
crass (1.0.6)
|
crass (1.0.6)
|
||||||
date (3.3.3)
|
date (3.3.3)
|
||||||
devise (4.9.3)
|
devise (4.9.3)
|
||||||
|
@ -116,6 +136,8 @@ GEM
|
||||||
erubi (1.12.0)
|
erubi (1.12.0)
|
||||||
execjs (2.5.2)
|
execjs (2.5.2)
|
||||||
ffi (1.15.5)
|
ffi (1.15.5)
|
||||||
|
fiber-annotation (0.2.0)
|
||||||
|
fiber-local (1.0.0)
|
||||||
globalid (1.2.1)
|
globalid (1.2.1)
|
||||||
activesupport (>= 6.1)
|
activesupport (>= 6.1)
|
||||||
globalize (6.3.0)
|
globalize (6.3.0)
|
||||||
|
@ -136,6 +158,7 @@ GEM
|
||||||
i18n (1.14.1)
|
i18n (1.14.1)
|
||||||
concurrent-ruby (~> 1.0)
|
concurrent-ruby (~> 1.0)
|
||||||
io-console (0.6.0)
|
io-console (0.6.0)
|
||||||
|
io-event (1.3.3)
|
||||||
irb (1.8.3)
|
irb (1.8.3)
|
||||||
rdoc
|
rdoc
|
||||||
reline (>= 0.3.8)
|
reline (>= 0.3.8)
|
||||||
|
@ -181,6 +204,13 @@ GEM
|
||||||
racc (~> 1.4)
|
racc (~> 1.4)
|
||||||
orm_adapter (0.5.0)
|
orm_adapter (0.5.0)
|
||||||
parallel (1.23.0)
|
parallel (1.23.0)
|
||||||
|
protocol-hpack (1.4.2)
|
||||||
|
protocol-http (0.25.0)
|
||||||
|
protocol-http1 (0.16.0)
|
||||||
|
protocol-http (~> 0.22)
|
||||||
|
protocol-http2 (0.15.1)
|
||||||
|
protocol-hpack (~> 1.4)
|
||||||
|
protocol-http (~> 0.18)
|
||||||
psych (5.1.1.1)
|
psych (5.1.1.1)
|
||||||
stringio
|
stringio
|
||||||
public_suffix (5.0.3)
|
public_suffix (5.0.3)
|
||||||
|
@ -287,8 +317,11 @@ GEM
|
||||||
terser (1.1.17)
|
terser (1.1.17)
|
||||||
execjs (>= 0.3.0, < 3)
|
execjs (>= 0.3.0, < 3)
|
||||||
thor (1.2.2)
|
thor (1.2.2)
|
||||||
|
thread-local (1.1.0)
|
||||||
tilt (2.2.0)
|
tilt (2.2.0)
|
||||||
timeout (0.4.0)
|
timeout (0.4.0)
|
||||||
|
timers (4.3.5)
|
||||||
|
traces (0.11.1)
|
||||||
tzinfo (2.0.6)
|
tzinfo (2.0.6)
|
||||||
concurrent-ruby (~> 1.0)
|
concurrent-ruby (~> 1.0)
|
||||||
unf (0.1.4)
|
unf (0.1.4)
|
||||||
|
@ -314,6 +347,8 @@ PLATFORMS
|
||||||
DEPENDENCIES
|
DEPENDENCIES
|
||||||
RocketAMF!
|
RocketAMF!
|
||||||
addressable (~> 2.8)
|
addressable (~> 2.8)
|
||||||
|
async (~> 2.6)
|
||||||
|
async-http (~> 0.61.0)
|
||||||
bootsnap (~> 1.16)
|
bootsnap (~> 1.16)
|
||||||
devise (~> 4.9, >= 4.9.2)
|
devise (~> 4.9, >= 4.9.2)
|
||||||
devise-encryptable (~> 0.2.0)
|
devise-encryptable (~> 0.2.0)
|
||||||
|
@ -344,6 +379,7 @@ DEPENDENCIES
|
||||||
sprockets (~> 4.2)
|
sprockets (~> 4.2)
|
||||||
stackprof (~> 0.2.25)
|
stackprof (~> 0.2.25)
|
||||||
terser (~> 1.1, >= 1.1.17)
|
terser (~> 1.1, >= 1.1.17)
|
||||||
|
thread-local (~> 1.1)
|
||||||
web-console (~> 4.2)
|
web-console (~> 4.2)
|
||||||
will_paginate (~> 4.0)
|
will_paginate (~> 4.0)
|
||||||
|
|
||||||
|
|
|
@ -1,55 +1,5 @@
|
||||||
class AddManifestUrlToSwfAssets < ActiveRecord::Migration[7.1]
|
class AddManifestUrlToSwfAssets < ActiveRecord::Migration[7.1]
|
||||||
def change
|
def change
|
||||||
add_column :swf_assets, :manifest_url, :string
|
add_column :swf_assets, :manifest_url, :string
|
||||||
|
|
||||||
# Okay, this is a big one to run upward! We're going to infer the manifest
|
|
||||||
# for as many assets as we can!
|
|
||||||
reversible do |direction|
|
|
||||||
direction.up do
|
|
||||||
Net::HTTP.start("images.neopets.com", 443, use_ssl: true) do |http|
|
|
||||||
SwfAsset.find_each do |swf_asset|
|
|
||||||
begin
|
|
||||||
manifest_url = infer_manifest_url(http, swf_asset.url)
|
|
||||||
rescue StandardError => error
|
|
||||||
Rails.logger.warn "Could not infer manifest URL for #{swf_asset.id}, skipping: #{error.message}"
|
|
||||||
next
|
|
||||||
end
|
|
||||||
|
|
||||||
Rails.logger.info "#{swf_asset.id}: #{manifest_url}"
|
|
||||||
swf_asset.manifest_url = manifest_url
|
|
||||||
swf_asset.save!
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
|
|
||||||
def infer_manifest_url(http, swf_url)
|
|
||||||
url_match = swf_url.match(SWF_URL_PATTERN)
|
|
||||||
raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?
|
|
||||||
|
|
||||||
# Build the potential manifest URLs, from the two structures we know of.
|
|
||||||
type, folders, hash_str = url_match.captures
|
|
||||||
potential_manifest_urls = [
|
|
||||||
"https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json",
|
|
||||||
"https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Send a HEAD request to test each manifest URL, without downloading its
|
|
||||||
# content. If it succeeds, we're done!
|
|
||||||
potential_manifest_urls.each do |potential_manifest_url|
|
|
||||||
res = http.head potential_manifest_url
|
|
||||||
if res.is_a? Net::HTTPOK
|
|
||||||
return potential_manifest_url
|
|
||||||
elsif res.is_a? Net::HTTPNotFound
|
|
||||||
next
|
|
||||||
else
|
|
||||||
raise "unexpected manifest response code: #{res.code}"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Otherwise, there's no valid manifest URL.
|
|
||||||
raise "none of the common manifest URL patterns returned HTTP 200"
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
93
lib/tasks/swf_assets.rake
Normal file
93
lib/tasks/swf_assets.rake
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
require 'async/barrier'
|
||||||
|
require 'async/http/internet/instance'
|
||||||
|
|
||||||
|
namespace :swf_assets do
|
||||||
|
desc "Backfill manifest_url for SwfAsset models"
|
||||||
|
task manifests: [:environment] do
|
||||||
|
assets = SwfAsset.where(manifest_url: nil)
|
||||||
|
count = assets.count
|
||||||
|
puts "Found #{count} assets without manifests"
|
||||||
|
|
||||||
|
Async do
|
||||||
|
# Share a pool of persistent connections, rather than reconnecting on
|
||||||
|
# each request. (This library does that automatically!)
|
||||||
|
internet = Async::HTTP::Internet.instance
|
||||||
|
|
||||||
|
# Load the assets in batches, then process each batch in two steps: first
|
||||||
|
# inferring all manifest URLs in the batch, then saving all assets in the
|
||||||
|
# batch. (This makes the update step more efficient, and it also avoids
|
||||||
|
# simultaneous queries across the fibers, which ActiveRecord disallows!)
|
||||||
|
#
|
||||||
|
# We keep track of a shared index `i` here, but we only actually
|
||||||
|
# increment it once each task is *done*, so that the numbers output in
|
||||||
|
# the right order!
|
||||||
|
i = 0
|
||||||
|
assets.find_in_batches(batch_size: 1000) do |batch|
|
||||||
|
# Create a barrier, to let us wait on all the tasks; then under it
|
||||||
|
# create a semaphore, to limit how many tasks run at once.
|
||||||
|
barrier = Async::Barrier.new
|
||||||
|
semaphore = Async::Semaphore.new(100, parent: barrier)
|
||||||
|
|
||||||
|
batch.each do |asset|
|
||||||
|
semaphore.async do |task|
|
||||||
|
manifest_url = nil
|
||||||
|
begin
|
||||||
|
task.with_timeout(5) do
|
||||||
|
manifest_url = infer_manifest_url(asset.url, internet)
|
||||||
|
end
|
||||||
|
rescue StandardError => error
|
||||||
|
i += 1
|
||||||
|
puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
i += 1
|
||||||
|
puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}"
|
||||||
|
|
||||||
|
# Write, but don't yet save, the manifest URL.
|
||||||
|
asset.manifest_url = manifest_url
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Wait for all the above tasks to finish. (Then, all of the assets that
|
||||||
|
# succeeded should have an unsaved `manifest_url` change.)
|
||||||
|
barrier.wait
|
||||||
|
|
||||||
|
# Save all of the assets in the batch. (We do this in a transaction not
|
||||||
|
# for the transactional semantics, but because it's notably faster than
|
||||||
|
# doing a commit between each query, which is what sending the queries
|
||||||
|
# individually would effectively do!)
|
||||||
|
SwfAsset.transaction { batch.each(&:save!) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
|
||||||
|
def infer_manifest_url(swf_url, internet)
|
||||||
|
url_match = swf_url.match(SWF_URL_PATTERN)
|
||||||
|
raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?
|
||||||
|
|
||||||
|
# Build the potential manifest URLs, from the two structures we know of.
|
||||||
|
type, folders, hash_str = url_match.captures
|
||||||
|
potential_manifest_urls = [
|
||||||
|
"https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json",
|
||||||
|
"https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Send a HEAD request to test each manifest URL, without downloading its
|
||||||
|
# content. If it succeeds, we're done!
|
||||||
|
potential_manifest_urls.each do |potential_manifest_url|
|
||||||
|
res = internet.head potential_manifest_url
|
||||||
|
if res.ok?
|
||||||
|
return potential_manifest_url
|
||||||
|
elsif res.status == 404
|
||||||
|
next # Ok, this was not the manifest!
|
||||||
|
else
|
||||||
|
raise "unexpected manifest response code: #{res.status}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Otherwise, there's no valid manifest URL.
|
||||||
|
raise "none of the common manifest URL patterns returned HTTP 200"
|
||||||
|
end
|
BIN
vendor/cache/async-2.6.5.gem
vendored
Normal file
BIN
vendor/cache/async-2.6.5.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/async-http-0.61.0.gem
vendored
Normal file
BIN
vendor/cache/async-http-0.61.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/async-io-1.37.0.gem
vendored
Normal file
BIN
vendor/cache/async-io-1.37.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/async-pool-0.4.0.gem
vendored
Normal file
BIN
vendor/cache/async-pool-0.4.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/console-1.23.2.gem
vendored
Normal file
BIN
vendor/cache/console-1.23.2.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/fiber-annotation-0.2.0.gem
vendored
Normal file
BIN
vendor/cache/fiber-annotation-0.2.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/fiber-local-1.0.0.gem
vendored
Normal file
BIN
vendor/cache/fiber-local-1.0.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/io-event-1.3.3.gem
vendored
Normal file
BIN
vendor/cache/io-event-1.3.3.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/protocol-hpack-1.4.2.gem
vendored
Normal file
BIN
vendor/cache/protocol-hpack-1.4.2.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/protocol-http-0.25.0.gem
vendored
Normal file
BIN
vendor/cache/protocol-http-0.25.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/protocol-http1-0.16.0.gem
vendored
Normal file
BIN
vendor/cache/protocol-http1-0.16.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/protocol-http2-0.15.1.gem
vendored
Normal file
BIN
vendor/cache/protocol-http2-0.15.1.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/thread-local-1.1.0.gem
vendored
Normal file
BIN
vendor/cache/thread-local-1.1.0.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/timers-4.3.5.gem
vendored
Normal file
BIN
vendor/cache/timers-4.3.5.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/traces-0.11.1.gem
vendored
Normal file
BIN
vendor/cache/traces-0.11.1.gem
vendored
Normal file
Binary file not shown.
Loading…
Reference in a new issue