require 'async/barrier'
require 'async/http/internet/instance'

namespace :swf_assets do
  # Create swf_assets:remove_duplicates task
  #
  # I'm not sure where these duplicate records have been coming from over the
  # years (I checked the timestamps, and it's been happening occasionally
  # since 2013 up through late last year; there were ~1,600 instances), but
  # for now let's just get rid of them!
  #
  # This is related to the issues we've been addressing lately where some
  # biology assets have manifests but no PNG specified in them: the older
  # copies of the assets would have our generated PNG as a fallback, but the
  # newer copies would get served as part of the pet appearance *in addition
  # to* the older copies, and the newer copies would be marked as having no
  # DTI-generated image, which our system wasn't always able to handle.
  #
  # We've primarily been addressing this by leaning into more graceful failure
  # modes of skipping certain layers, but… these layers *shouldn't be here*,
  # and they're cluttering up support tools and such; let's be rid of them!
  #
  # I ran this today seemingly without issue, but I kept a backup from the
  # `yarn db:export:public-data` task in `impress-2020`, to be able to check
  # and roll back if we discover a mistake.
  #
  # One last note: the `ORDER BY` clause in the `GROUP_CONCAT` call was a late
  # addition, *after* I ran this in production. Scanning the console output,
  # it seems like ordering by ID was MySQL's default behavior here anyway
  # (makes sense!), so I'm not gonna bother to roll back and re-run, but I
  # think specifying it is helpful, to ensure we're not depending on
  # unspecified behavior and to be really clear about our intention of which
  # record to keep (the one with the smallest DTI ID number).
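  #
  # As a sanity check, here's roughly what that grouping compiles to (a sketch
  # for a Rails console; exact column quoting varies by adapter):
  #
  #   SwfAsset.group(:type, :remote_id).
  #     having("COUNT(*) > 1").
  #     select(:type, :remote_id, Arel.sql("GROUP_CONCAT(id ORDER BY id ASC)")).
  #     to_sql
  #   # => "SELECT type, remote_id, GROUP_CONCAT(id ORDER BY id ASC)
  #   #     FROM swf_assets GROUP BY type, remote_id HAVING (COUNT(*) > 1)"
  #
  # Because each group's IDs come back in ascending order, the first ID in
  # each group is the oldest copy, which is the one the task keeps.
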
  # NOTE: I'm not sure how these duplicate records enter our database, probably
  # a bug in the modeling code somewhere? For now, let's just remove them, and
  # be ready to run it again if needed!
  # NOTE: Run with DRY_RUN=1 to see what it would do first!
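  #
  # Invocation sketch (the exact runner is up to your setup; `rake` or a
  # binstub also work for Rails tasks):
  #
  #   DRY_RUN=1 rails swf_assets:remove_duplicates   # preview only
  #   rails swf_assets:remove_duplicates              # actually delete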
  desc "Remove duplicate SwfAsset records"
  task remove_duplicates: [:environment] do
    duplicate_groups = SwfAsset.group(:type, :remote_id).
      having("COUNT(*) > 1").
      pluck(:type, :remote_id, Arel.sql("GROUP_CONCAT(id ORDER BY id ASC)"))

    total = duplicate_groups.size
    puts "Found #{total} groups of duplicate records"

    SwfAsset.transaction do
      duplicate_groups.each_with_index do |(type, remote_id, ids_str), index|
        ids = ids_str.split(",")
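        # GROUP_CONCAT listed each group's IDs in ascending order, so ids.first
        # is the oldest record: we keep it and delete the rest.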
        duplicate_ids = ids[1..]
        duplicate_records = SwfAsset.find(duplicate_ids)
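        # (SwfAsset.find raises if any of these IDs is missing, so we'd notice
        # if something else had already deleted them.)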
        if ENV["DRY_RUN"]
          puts "[#{index + 1}/#{total}] #{type}/#{remote_id}: " +
            "Would delete #{duplicate_records.size} records " +
            "(#{duplicate_records.map(&:id).join(", ")})"
        else
          puts "[#{index + 1}/#{total}] #{type}/#{remote_id}: " +
            "Deleting #{duplicate_records.size} records " +
            "(#{duplicate_records.map(&:id).join(", ")})"
          duplicate_records.each(&:destroy)
        end
      end
    end
  end
  namespace :manifests do
    desc "Save all known manifests to the Neopets Media Archive"
    task load: [:environment] do
      # Log errors to STDOUT, but we don't need the info messages about
      # successful saves.
      Rails.logger = Logger.new(STDOUT, level: :error)

      # Find all the manifests with known URLs. (We don't have a database
      # filter for "do we already have the manifest downloaded", but that's
      # okay, the preload method will quickly check that for us!)
      swf_assets = SwfAsset.where.not(manifest_url: nil)
      total_count = swf_assets.count
      puts "Found #{total_count} assets with manifests"

      # For each batch of 1000 assets, load their manifests concurrently.
      # Wrap everything in a top-level sync, so keyboard interrupts will
      # propagate all the way up to here, instead of just cancelling the
      # current batch.
      Sync do
        saved_count = 0
        swf_assets.find_in_batches(batch_size: 1000) do |swf_assets|
          SwfAsset.preload_manifests(swf_assets)
          saved_count += swf_assets.size
          puts "Loaded #{saved_count} of #{total_count} manifests"
        end
      end
    end

    desc "Backfill manifest_url for SwfAsset models"
    task urls: [:environment] do
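      # The per-asset timeout (in seconds) for inferring a manifest URL below;
      # override it at invocation time via the TIMEOUT environment variable,
      # e.g. TIMEOUT=10. (The default is 5 seconds, from the fetch below.)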
      timeout = ENV.fetch("TIMEOUT", "5").to_i

      assets = SwfAsset.where(manifest_url: nil)
      count = assets.count
      puts "Found #{count} assets without manifests"

      Sync do
        # Share a pool of persistent connections, rather than reconnecting on
        # each request. (This library does that automatically!)
        internet = Async::HTTP::Internet.instance

        # Load the assets in batches, then process each batch in two steps:
        # first inferring all manifest URLs in the batch, then saving all
        # assets in the batch. (This makes the update step more efficient, and
        # it also avoids simultaneous queries across the fibers, which
        # ActiveRecord disallows!)
        #
        # We keep track of a shared index `i` here, but we only actually
        # increment it once each task is *done*, so that the numbers are
        # printed in the right order!
        i = 0
        assets.find_in_batches(batch_size: 1000) do |batch|
          # Create a barrier, to let us wait on all the tasks; then under it
          # create a semaphore, to limit how many tasks run at once.
          barrier = Async::Barrier.new
          semaphore = Async::Semaphore.new(100, parent: barrier)
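          # (With the semaphore parented to the barrier, at most 100 of this
          # batch's fibers run at once, and barrier.wait below won't return
          # until every one of them has finished.)
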
          batch.each do |asset|
            semaphore.async do |task|
              manifest_url = nil
              begin
                task.with_timeout(timeout) do
                  manifest_url = infer_manifest_url(asset.url, internet)
                end
              rescue StandardError => error
                i += 1
                puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}"
                next
              end

              i += 1
              puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}"

              # Write, but don't yet save, the manifest URL.
              asset.manifest_url = manifest_url
            end
          end

          # Wait for all the above tasks to finish. (Then, all of the assets
          # that succeeded should have an unsaved `manifest_url` change.)
          barrier.wait

          # Save all of the assets in the batch. (We do this in a transaction
          # not for the transactional semantics, but because it's notably
          # faster than doing a commit between each query, which is what
          # sending the queries individually would effectively do!)
          begin
            SwfAsset.transaction do
              batch.each do |asset|
                begin
                  asset.save!
                rescue StandardError => error
                  puts "⚠️ Saving asset #{asset.id} failed: #{error.full_message}"
                end
              end
            end
          rescue StandardError => error
            puts "⚠️ Saving this batch failed: #{error.full_message}"
          end
        end
      end
    end
  end
end

SWF_URL_PATTERN = %r{^(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf$}
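# (The three capture groups are the asset type ("bio" or "items"), the folder
# path, and the trailing hash in the filename; infer_manifest_url destructures
# them below.)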
def infer_manifest_url(swf_url, internet)
  url_match = swf_url.match(SWF_URL_PATTERN)
  raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?

  # Build the potential manifest URLs, from the two structures we know of.
  type, folders, hash_str = url_match.captures
  potential_manifest_urls = [
    "https://images.neopets.com/cp/#{type}/data/#{folders}/manifest.json",
    "https://images.neopets.com/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
  ]
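
  # For example, a hypothetical (made-up) asset URL like
  #   https://images.neopets.com/cp/bio/swf/000/012/345_abcdef0123.swf
  # would produce these two candidates:
  #   https://images.neopets.com/cp/bio/data/000/012/345/manifest.json
  #   https://images.neopets.com/cp/bio/data/000/012/345_abcdef0123/manifest.json
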
  # Send a HEAD request to test each manifest URL, without downloading its
  # content. If it succeeds, we're done!
  potential_manifest_urls.each do |potential_manifest_url|
    res = internet.head potential_manifest_url
    if res.ok?
      return potential_manifest_url
    elsif res.status == 404
      next # Ok, this was not the manifest!
    else
      raise "unexpected manifest response code: #{res.status}"
    end
  end

  # Otherwise, there's no valid manifest URL.
  raise "all of the common manifest URL patterns returned HTTP 404"
end