diff --git a/Gemfile b/Gemfile index 34b6b06b..3ee5010b 100644 --- a/Gemfile +++ b/Gemfile @@ -58,6 +58,11 @@ gem 'parallel', '~> 1.23' gem "httparty", "~> 0.21.0" gem "addressable", "~> 2.8" +# For advanced batching of many HTTP requests. +gem "async", "~> 2.6", require: false +gem "async-http", "~> 0.61.0", require: false +gem "thread-local", "~> 1.1", require: false + # For debugging. gem 'web-console', '~> 4.2', group: :development diff --git a/Gemfile.lock b/Gemfile.lock index 6dad04d4..3a334dd6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -82,6 +82,23 @@ GEM tzinfo (~> 2.0) addressable (2.8.4) public_suffix (>= 2.0.2, < 6.0) + async (2.6.5) + console (~> 1.10) + fiber-annotation + io-event (~> 1.1) + timers (~> 4.1) + async-http (0.61.0) + async (>= 1.25) + async-io (>= 1.28) + async-pool (>= 0.2) + protocol-http (~> 0.25.0) + protocol-http1 (~> 0.16.0) + protocol-http2 (~> 0.15.0) + traces (>= 0.10.0) + async-io (1.37.0) + async + async-pool (0.4.0) + async (>= 1.25) babel-source (5.8.35) babel-transpiler (0.7.0) babel-source (>= 4.0, < 6) @@ -95,6 +112,9 @@ GEM builder (3.2.4) concurrent-ruby (1.2.2) connection_pool (2.2.5) + console (1.23.2) + fiber-annotation + fiber-local crass (1.0.6) date (3.3.3) devise (4.9.3) @@ -116,6 +136,8 @@ GEM erubi (1.12.0) execjs (2.5.2) ffi (1.15.5) + fiber-annotation (0.2.0) + fiber-local (1.0.0) globalid (1.2.1) activesupport (>= 6.1) globalize (6.3.0) @@ -136,6 +158,7 @@ GEM i18n (1.14.1) concurrent-ruby (~> 1.0) io-console (0.6.0) + io-event (1.3.3) irb (1.8.3) rdoc reline (>= 0.3.8) @@ -181,6 +204,13 @@ GEM racc (~> 1.4) orm_adapter (0.5.0) parallel (1.23.0) + protocol-hpack (1.4.2) + protocol-http (0.25.0) + protocol-http1 (0.16.0) + protocol-http (~> 0.22) + protocol-http2 (0.15.1) + protocol-hpack (~> 1.4) + protocol-http (~> 0.18) psych (5.1.1.1) stringio public_suffix (5.0.3) @@ -287,8 +317,11 @@ GEM terser (1.1.17) execjs (>= 0.3.0, < 3) thor (1.2.2) + thread-local (1.1.0) tilt (2.2.0) timeout (0.4.0) + timers (4.3.5) + traces (0.11.1) tzinfo (2.0.6) concurrent-ruby (~> 1.0) unf (0.1.4) @@ -314,6 +347,8 @@ PLATFORMS DEPENDENCIES RocketAMF! addressable (~> 2.8) + async (~> 2.6) + async-http (~> 0.61.0) bootsnap (~> 1.16) devise (~> 4.9, >= 4.9.2) devise-encryptable (~> 0.2.0) @@ -344,6 +379,7 @@ DEPENDENCIES sprockets (~> 4.2) stackprof (~> 0.2.25) terser (~> 1.1, >= 1.1.17) + thread-local (~> 1.1) web-console (~> 4.2) will_paginate (~> 4.0) diff --git a/db/migrate/20231110043543_add_manifest_url_to_swf_assets.rb b/db/migrate/20231110043543_add_manifest_url_to_swf_assets.rb index e357af1b..93afa3b3 100644 --- a/db/migrate/20231110043543_add_manifest_url_to_swf_assets.rb +++ b/db/migrate/20231110043543_add_manifest_url_to_swf_assets.rb @@ -1,55 +1,5 @@ class AddManifestUrlToSwfAssets < ActiveRecord::Migration[7.1] def change add_column :swf_assets, :manifest_url, :string - - # Okay, this is a big one to run upward! We're going to infer the manifest - # for as many assets as we can! - reversible do |direction| - direction.up do - Net::HTTP.start("images.neopets.com", 443, use_ssl: true) do |http| - SwfAsset.find_each do |swf_asset| - begin - manifest_url = infer_manifest_url(http, swf_asset.url) - rescue StandardError => error - Rails.logger.warn "Could not infer manifest URL for #{swf_asset.id}, skipping: #{error.message}" - next - end - - Rails.logger.info "#{swf_asset.id}: #{manifest_url}" - swf_asset.manifest_url = manifest_url - swf_asset.save! - end - end - end - end - end - - SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$} - def infer_manifest_url(http, swf_url) - url_match = swf_url.match(SWF_URL_PATTERN) - raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil? - - # Build the potential manifest URLs, from the two structures we know of. - type, folders, hash_str = url_match.captures - potential_manifest_urls = [ - "https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json", - "https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json", - ] - - # Send a HEAD request to test each manifest URL, without downloading its - # content. If it succeeds, we're done! - potential_manifest_urls.each do |potential_manifest_url| - res = http.head potential_manifest_url - if res.is_a? Net::HTTPOK - return potential_manifest_url - elsif res.is_a? Net::HTTPNotFound - next - else - raise "unexpected manifest response code: #{res.code}" - end - end - - # Otherwise, there's no valid manifest URL. - raise "none of the common manifest URL patterns returned HTTP 200" end end diff --git a/lib/tasks/swf_assets.rake b/lib/tasks/swf_assets.rake new file mode 100644 index 00000000..45edbad6 --- /dev/null +++ b/lib/tasks/swf_assets.rake @@ -0,0 +1,93 @@ +require 'async/barrier' +require 'async/http/internet/instance' + +namespace :swf_assets do + desc "Backfill manifest_url for SwfAsset models" + task manifests: [:environment] do + assets = SwfAsset.where(manifest_url: nil) + count = assets.count + puts "Found #{count} assets without manifests" + + Async do + # Share a pool of persistent connections, rather than reconnecting on + # each request. (This library does that automatically!) + internet = Async::HTTP::Internet.instance + + # Load the assets in batches, then process each batch in two steps: first + # inferring all manifest URLs in the batch, then saving all assets in the + # batch. (This makes the update step more efficient, and it also avoids + # simultaneous queries across the fibers, which ActiveRecord disallows!) + # + # We keep track of a shared index `i` here, but we only actually + # increment it once each task is *done*, so that the numbers output in + # the right order! + i = 0 + assets.find_in_batches(batch_size: 1000) do |batch| + # Create a barrier, to let us wait on all the tasks; then under it + # create a semaphore, to limit how many tasks run at once. + barrier = Async::Barrier.new + semaphore = Async::Semaphore.new(100, parent: barrier) + + batch.each do |asset| + semaphore.async do |task| + manifest_url = nil + begin + task.with_timeout(5) do + manifest_url = infer_manifest_url(asset.url, internet) + end + rescue StandardError => error + i += 1 + puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}" + next + end + + i += 1 + puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}" + + # Write, but don't yet save, the manifest URL. + asset.manifest_url = manifest_url + end + end + + # Wait for all the above tasks to finish. (Then, all of the assets that + # succeeded should have an unsaved `manifest_url` change.) + barrier.wait + + # Save all of the assets in the batch. (We do this in a transaction not + # for the transactional semantics, but because it's notably faster than + # doing a commit between each query, which is what sending the queries + # individually would effectively do!) + SwfAsset.transaction { batch.each(&:save!) } + end + end + end +end + +SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$} +def infer_manifest_url(swf_url, internet) + url_match = swf_url.match(SWF_URL_PATTERN) + raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil? + + # Build the potential manifest URLs, from the two structures we know of. + type, folders, hash_str = url_match.captures + potential_manifest_urls = [ + "https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json", + "https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json", + ] + + # Send a HEAD request to test each manifest URL, without downloading its + # content. If it succeeds, we're done! + potential_manifest_urls.each do |potential_manifest_url| + res = internet.head potential_manifest_url + if res.ok? + return potential_manifest_url + elsif res.status == 404 + next # Ok, this was not the manifest! + else + raise "unexpected manifest response code: #{res.status}" + end + end + + # Otherwise, there's no valid manifest URL. + raise "none of the common manifest URL patterns returned HTTP 200" +end diff --git a/vendor/cache/async-2.6.5.gem b/vendor/cache/async-2.6.5.gem new file mode 100644 index 00000000..01260f5e Binary files /dev/null and b/vendor/cache/async-2.6.5.gem differ diff --git a/vendor/cache/async-http-0.61.0.gem b/vendor/cache/async-http-0.61.0.gem new file mode 100644 index 00000000..53b41eb7 Binary files /dev/null and b/vendor/cache/async-http-0.61.0.gem differ diff --git a/vendor/cache/async-io-1.37.0.gem b/vendor/cache/async-io-1.37.0.gem new file mode 100644 index 00000000..ded29ecd Binary files /dev/null and b/vendor/cache/async-io-1.37.0.gem differ diff --git a/vendor/cache/async-pool-0.4.0.gem b/vendor/cache/async-pool-0.4.0.gem new file mode 100644 index 00000000..acd3b830 Binary files /dev/null and b/vendor/cache/async-pool-0.4.0.gem differ diff --git a/vendor/cache/console-1.23.2.gem b/vendor/cache/console-1.23.2.gem new file mode 100644 index 00000000..1503bd6e Binary files /dev/null and b/vendor/cache/console-1.23.2.gem differ diff --git a/vendor/cache/fiber-annotation-0.2.0.gem b/vendor/cache/fiber-annotation-0.2.0.gem new file mode 100644 index 00000000..d93eda3e Binary files /dev/null and b/vendor/cache/fiber-annotation-0.2.0.gem differ diff --git a/vendor/cache/fiber-local-1.0.0.gem b/vendor/cache/fiber-local-1.0.0.gem new file mode 100644 index 00000000..68323423 Binary files /dev/null and b/vendor/cache/fiber-local-1.0.0.gem differ diff --git a/vendor/cache/io-event-1.3.3.gem b/vendor/cache/io-event-1.3.3.gem new file mode 100644 index 00000000..e6ad915b Binary files /dev/null and b/vendor/cache/io-event-1.3.3.gem differ diff --git a/vendor/cache/protocol-hpack-1.4.2.gem b/vendor/cache/protocol-hpack-1.4.2.gem new file mode 100644 index 00000000..b659b095 Binary files /dev/null and b/vendor/cache/protocol-hpack-1.4.2.gem differ diff --git a/vendor/cache/protocol-http-0.25.0.gem b/vendor/cache/protocol-http-0.25.0.gem new file mode 100644 index 00000000..0e2996bb Binary files /dev/null and b/vendor/cache/protocol-http-0.25.0.gem differ diff --git a/vendor/cache/protocol-http1-0.16.0.gem b/vendor/cache/protocol-http1-0.16.0.gem new file mode 100644 index 00000000..71f08ba5 Binary files /dev/null and b/vendor/cache/protocol-http1-0.16.0.gem differ diff --git a/vendor/cache/protocol-http2-0.15.1.gem b/vendor/cache/protocol-http2-0.15.1.gem new file mode 100644 index 00000000..495adf95 Binary files /dev/null and b/vendor/cache/protocol-http2-0.15.1.gem differ diff --git a/vendor/cache/thread-local-1.1.0.gem b/vendor/cache/thread-local-1.1.0.gem new file mode 100644 index 00000000..ed853232 Binary files /dev/null and b/vendor/cache/thread-local-1.1.0.gem differ diff --git a/vendor/cache/timers-4.3.5.gem b/vendor/cache/timers-4.3.5.gem new file mode 100644 index 00000000..4214cd52 Binary files /dev/null and b/vendor/cache/timers-4.3.5.gem differ diff --git a/vendor/cache/traces-0.11.1.gem b/vendor/cache/traces-0.11.1.gem new file mode 100644 index 00000000..dcff8e28 Binary files /dev/null and b/vendor/cache/traces-0.11.1.gem differ