Emi Matchu
95949da6f9
I'm not sure where these duplicate records have been coming from over the years (I checked the timestamps and it's been happening occasionally since 2013 up to late last year; there were ~1,600 instances), but for now let's just get rid of them! This is related to the issues we've been addressing lately where some biology assets have manifests but no PNG specified in them: the older copies of the assets would have our generated PNG as a fallback, but the newer copies would get served as part of the pet appearance *in addition to* the older copies, and the newer copies would be marked as having no DTI-generated image, which our system wasn't always able to handle. We've primarily been addressing this by leaning into more graceful failure modes of skipping certain layers, but… these layers *shouldn't be here*, and are cluttering up support tools and such; let's be rid of them! I ran this today seemingly without issue, but I kept a backup of the `yarn db:export:public-data` task in `impress-2020` to be able to check and roll back if we discover a mistake. One last note: the `ORDER BY` clause in the `GROUP_CONCAT` call was a late addition, *after* I ran this in production. Scanning the console output, it seems like ordering by ID was MySQL's default behavior here anyway (makes sense!), so I'm not gonna bother to roll back and re-run, but I think specifying this is helpful to ensure we're not depending on unspecified behavior and to be really clear about our intentions of which record to keep (the one with the smallest DTI ID number).
140 lines
4.9 KiB
Ruby
140 lines
4.9 KiB
Ruby
require 'async/barrier'
|
|
require 'async/http/internet/instance'
|
|
|
|
namespace :swf_assets do
  # NOTE: I'm not sure how these duplicate records enter our database, probably
  # a bug in the modeling code somewhere? For now, let's just remove them, and
  # be ready to run it again if needed!
  #
  # Records are considered duplicates when they share the same `type` and
  # `remote_id`. Within each group, the record with the smallest ID is kept
  # and the others are destroyed, all inside one transaction.
  #
  # NOTE: Run with DRY_RUN=1 to see what it would do first! (Dry-run mode is
  # on whenever DRY_RUN is set at all, even to an empty value.)
  desc "Remove duplicate SwfAsset records"
  task remove_duplicates: [:environment] do
    dry_run = ENV.key?("DRY_RUN")

    # Each row is [type, remote_id, "id1,id2,..."], with the IDs explicitly
    # sorted ascending so that the first one is the record we keep.
    #
    # NOTE(review): MySQL truncates GROUP_CONCAT output at
    # `group_concat_max_len` (1024 bytes by default), so this assumes no
    # group is large enough to hit that limit -- confirm if groups get big.
    duplicate_groups = SwfAsset.group(:type, :remote_id).
      having("COUNT(*) > 1").
      pluck(:type, :remote_id, Arel.sql("GROUP_CONCAT(id ORDER BY id ASC)"))

    total = duplicate_groups.size
    puts "Found #{total} groups of duplicate records"

    SwfAsset.transaction do
      duplicate_groups.each_with_index do |(type, remote_id, ids_str), index|
        # Keep the first (smallest) ID; everything after it is a duplicate.
        _kept_id, *duplicate_ids = ids_str.split(",")
        duplicate_records = SwfAsset.find(duplicate_ids)

        action = dry_run ? "Would delete" : "Deleting"
        puts "[#{index + 1}/#{total}] #{type}/#{remote_id}: " \
          "#{action} #{duplicate_records.size} records " \
          "(#{duplicate_records.map(&:id).join(", ")})"

        # Use `destroy!` rather than `destroy`, so that a record that can't
        # be destroyed raises (rolling back the whole transaction) instead of
        # being silently skipped after we've logged it as deleted.
        duplicate_records.each(&:destroy!) unless dry_run
      end
    end
  end

  # Finds every SwfAsset with no `manifest_url`, probes the known manifest
  # URL patterns for it over HTTP, and saves whichever one exists.
  #
  # Set TIMEOUT (in seconds, default 5) to change how long we wait on each
  # asset before skipping it.
  desc "Backfill manifest_url for SwfAsset models"
  task manifests: [:environment] do
    timeout = ENV.fetch("TIMEOUT", "5").to_i

    assets = SwfAsset.where(manifest_url: nil)
    count = assets.count
    puts "Found #{count} assets without manifests"

    Async do
      # Share a pool of persistent connections, rather than reconnecting on
      # each request. (This library does that automatically!)
      internet = Async::HTTP::Internet.instance

      # Load the assets in batches, then process each batch in two steps: first
      # inferring all manifest URLs in the batch, then saving all assets in the
      # batch. (This makes the update step more efficient, and it also avoids
      # simultaneous queries across the fibers, which ActiveRecord disallows!)
      #
      # We keep track of a shared index `i` here, but we only actually
      # increment it once each task is *done*, so that the numbers are output
      # in the right order!
      i = 0
      assets.find_in_batches(batch_size: 1000) do |batch|
        # Create a barrier, to let us wait on all the tasks; then under it
        # create a semaphore, to limit how many tasks run at once.
        barrier = Async::Barrier.new
        semaphore = Async::Semaphore.new(100, parent: barrier)

        batch.each do |asset|
          semaphore.async do |task|
            manifest_url = nil
            begin
              task.with_timeout(timeout) do
                manifest_url = infer_manifest_url(asset.url, internet)
              end
            rescue StandardError => error
              # Timeouts, bad URLs, and unexpected responses all land here;
              # we log the asset and leave it untouched.
              i += 1
              puts "[#{i}/#{count}] ⚠️ Skipping #{asset.id}: #{error.message}"
              next
            end

            i += 1
            puts "[#{i}/#{count}] Manifest for #{asset.id}: #{manifest_url}"

            # Write, but don't yet save, the manifest URL.
            asset.manifest_url = manifest_url
          end
        end

        # Wait for all the above tasks to finish. (Then, all of the assets that
        # succeeded should have an unsaved `manifest_url` change.)
        barrier.wait

        # Save the assets in the batch that we actually changed. (We do this
        # in a transaction not for the transactional semantics, but because
        # it's notably faster than doing a commit between each query, which
        # is what sending the queries individually would effectively do!)
        begin
          SwfAsset.transaction do
            batch.each do |asset|
              # Skipped assets have nothing to write, so don't re-validate
              # them or report save failures for records we never touched.
              next unless asset.changed?

              begin
                asset.save!
              rescue StandardError => error
                puts "⚠️ Saving asset #{asset.id} failed: #{error.full_message}"
              end
            end
          end
        rescue StandardError => error
          puts "⚠️ Saving this batch failed: #{error.full_message}"
        end
      end
    end
  end
end
|
|
|
|
# Matches a Neopets customization SWF URL (scheme optional), capturing:
#   1. the asset kind in the path ("bio" or "items"),
#   2. the folder path up to the final underscore, and
#   3. the lowercase alphanumeric hash between that underscore and ".swf".
#
# Anchored with \A and \z (not ^ and $, which match at every line boundary in
# Ruby), so the *entire* string has to be the URL.
SWF_URL_PATTERN = %r{\A(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf\z}
|
|
# Given an asset's SWF URL, find the URL of its manifest.json by probing the
# two URL structures we know of, in order, with HEAD requests.
#
# swf_url  - the SWF URL string; must match SWF_URL_PATTERN.
# internet - an HTTP client responding to `head(url)`, whose responses
#            respond to `ok?`, `status`, and `close`.
#
# Returns the first candidate manifest URL whose response is `ok?`.
#
# Raises ArgumentError if `swf_url` doesn't match SWF_URL_PATTERN, and
# RuntimeError if a candidate responds with anything other than OK or 404, or
# if every candidate responds with 404.
def infer_manifest_url(swf_url, internet)
  url_match = swf_url.match(SWF_URL_PATTERN)
  raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?

  # Build the potential manifest URLs, from the two structures we know of.
  type, folders, hash_str = url_match.captures
  data_root = "https://images.neopets.com/cp/#{type}/data"
  potential_manifest_urls = [
    "#{data_root}/#{folders}/manifest.json",
    "#{data_root}/#{folders}_#{hash_str}/manifest.json",
  ]

  # Send a HEAD request to test each manifest URL, without downloading its
  # content. If it succeeds, we're done!
  potential_manifest_urls.each do |potential_manifest_url|
    res = internet.head(potential_manifest_url)
    begin
      return potential_manifest_url if res.ok?
      next if res.status == 404 # Ok, this was not the manifest!

      raise "unexpected manifest response code: #{res.status}"
    ensure
      # Always close the response once we've looked at its status, whichever
      # way we leave this iteration, as the HTTP client's docs recommend.
      res.close
    end
  end

  # Otherwise, there's no valid manifest URL.
  raise "all of the common manifest URL patterns returned HTTP 404"
end
|