Add manifest_url to swf_assets table
Ok so, impress-2020 guesses the manifest URL every time based on common URL patterns. But the right way to do this is to read it from the modeling data! The catch is that we don't have a great way to get the modeling data directly. (Though as I write this, I guess we do have that auto-modeling trick we use in the DTI 2020 codebase; I wonder if that could work for this too?)

So anyway, in this change, we update the modeling code to save the manifest URL, and the migration also includes a big block that attempts to run impress-2020's manifest-guessing logic for every asset and save the result!

It's, uhh, not fast. It runs at about 1 asset per second (a lot of these aren't cache hits), and sometimes stalls out. And we have >600k assets, so the estimated wall time is, uhh, seven days?

I think there's something we could do here around, like, concurrent execution? Though tbqh, since the slowness seems to come from hitting the slow underlying images.neopets.com server, I don't have a lot of faith that concurrency would actually be faster.

I also think it could be sensible to extract this from the migration, and run it as a script to infer missing manifest URLs. That would be easier to run in chunks and to resume if something goes wrong. My reasoning here was that backfilling this data was part of the migration process… but the thing is, this migration can't reliably get a manifest for everything (both cuz it depends on an external service and cuz not everything has one), so it's a perfectly valid migration to just leave the column as null for all the rows to start, and fill it in later. I wish I'd written it like that!

But anyway, I'm just running this for now, and taking a break for the night. Maybe later I'll come around and extract this into a separate task that tries this on all assets that are missing manifests instead!
This commit is contained in:
parent
3a963c7d25
commit
96998643b5
4 changed files with 68 additions and 2 deletions
|
@ -16,6 +16,8 @@ class SwfAsset < ApplicationRecord
|
||||||
|
|
||||||
scope :includes_depth, -> { includes(:zone) }
|
scope :includes_depth, -> { includes(:zone) }
|
||||||
|
|
||||||
|
before_validation :normalize_manifest_url
|
||||||
|
|
||||||
def swf_image_dir
|
def swf_image_dir
|
||||||
@swf_image_dir ||= Rails.root.join('tmp', 'asset_images_before_upload', self.id.to_s)
|
@swf_image_dir ||= Rails.root.join('tmp', 'asset_images_before_upload', self.id.to_s)
|
||||||
end
|
end
|
||||||
|
@ -141,6 +143,7 @@ class SwfAsset < ApplicationRecord
|
||||||
self.zone_id = data[:zone_id].to_i
|
self.zone_id = data[:zone_id].to_i
|
||||||
self.url = data[:asset_url]
|
self.url = data[:asset_url]
|
||||||
self.zones_restrict = data[:zones_restrict]
|
self.zones_restrict = data[:zones_restrict]
|
||||||
|
self.manifest_url = data[:manifest]
|
||||||
end
|
end
|
||||||
|
|
||||||
def origin_object_data=(data)
|
def origin_object_data=(data)
|
||||||
|
@ -149,6 +152,7 @@ class SwfAsset < ApplicationRecord
|
||||||
self.zone_id = data[:zone_id].to_i
|
self.zone_id = data[:zone_id].to_i
|
||||||
self.url = data[:asset_url]
|
self.url = data[:asset_url]
|
||||||
self.zones_restrict = ""
|
self.zones_restrict = ""
|
||||||
|
self.manifest_url = data[:manifest]
|
||||||
end
|
end
|
||||||
|
|
||||||
def mall_data=(data)
|
def mall_data=(data)
|
||||||
|
@ -156,6 +160,12 @@ class SwfAsset < ApplicationRecord
|
||||||
self.url = "https://images.neopets.com/#{data['url']}"
|
self.url = "https://images.neopets.com/#{data['url']}"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Callback (registered via `before_validation`) that rewrites `manifest_url`
# so it always uses the "https" scheme.
#
# Does nothing when `manifest_url` is nil or empty: there is nothing to
# normalize, and `Addressable::URI.parse(nil)` returns nil, so assigning a
# scheme on the result would raise NoMethodError and make every asset
# without a manifest URL impossible to validate or save.
def normalize_manifest_url
  return if manifest_url.nil? || manifest_url.empty?

  parsed_manifest_url = Addressable::URI.parse(manifest_url)
  parsed_manifest_url.scheme = "https"
  self.manifest_url = parsed_manifest_url.to_s
end
|
||||||
|
|
||||||
def self.from_wardrobe_link_params(ids)
|
def self.from_wardrobe_link_params(ids)
|
||||||
where((
|
where((
|
||||||
arel_table[:remote_id].in(ids[:biology]).and(arel_table[:type].eq('biology'))
|
arel_table[:remote_id].in(ids[:biology]).and(arel_table[:type].eq('biology'))
|
||||||
|
|
55
db/migrate/20231110043543_add_manifest_url_to_swf_assets.rb
Normal file
55
db/migrate/20231110043543_add_manifest_url_to_swf_assets.rb
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
# Adds the `manifest_url` string column to `swf_assets` and, when migrating
# up, tries to backfill it for every existing asset by probing
# images.neopets.com for a manifest at the URL structures we know of.
#
# The backfill is best-effort: assets whose manifest cannot be inferred are
# logged and left with a null `manifest_url`.
class AddManifestUrlToSwfAssets < ActiveRecord::Migration[7.1]
  # Matches an asset SWF URL (http, https, or protocol-relative) and captures
  # the asset type ("bio" or "items"), the folder path, and the trailing hash.
  # Anchored with \A and \z so the *whole* string must be a SWF URL; ^ and $
  # would accept a matching line embedded in a multi-line string.
  SWF_URL_PATTERN = %r{\A(?:https?:)?//images\.neopets\.com/cp/(bio|items)/swf/(.+?)_([a-z0-9]+)\.swf\z}

  def change
    add_column :swf_assets, :manifest_url, :string

    # Okay, this is a big one to run upward! We're going to infer the manifest
    # for as many assets as we can!
    reversible do |direction|
      direction.up do
        # Make sure the model sees the column we just added, in case its
        # column information was loaded before `add_column` ran.
        SwfAsset.reset_column_information

        # Open one HTTPS connection and reuse it for every HEAD request.
        Net::HTTP.start("images.neopets.com", 443, use_ssl: true) do |http|
          SwfAsset.find_each do |swf_asset|
            begin
              manifest_url = infer_manifest_url(http, swf_asset.url)
            rescue StandardError => error
              Rails.logger.warn "Could not infer manifest URL for #{swf_asset.id}, skipping: #{error.message}"
              next
            end

            Rails.logger.info "#{swf_asset.id}: #{manifest_url}"
            swf_asset.manifest_url = manifest_url
            # NOTE(review): `save!` runs validations and is not rescued, so a
            # single invalid record aborts the whole backfill.
            swf_asset.save!
          end
        end
      end
    end
  end

  # Infers the manifest URL for the asset whose SWF lives at `swf_url`.
  #
  # @param http [Net::HTTP] an open connection to images.neopets.com
  # @param swf_url [String] the asset's SWF URL
  # @return [String] the first candidate manifest URL that responds HTTP 200
  # @raise [ArgumentError] if `swf_url` does not match SWF_URL_PATTERN
  # @raise [RuntimeError] if a candidate responds with anything other than
  #   200 or 404, or if no candidate responds with 200
  def infer_manifest_url(http, swf_url)
    url_match = swf_url.match(SWF_URL_PATTERN)
    raise ArgumentError, "not a valid SWF URL: #{swf_url}" if url_match.nil?

    # Build the potential manifest paths, from the two structures we know of.
    type, folders, hash_str = url_match.captures
    potential_manifest_paths = [
      "/cp/#{type}/data/#{folders}/manifest.json",
      "/cp/#{type}/data/#{folders}_#{hash_str}/manifest.json",
    ]

    # Send a HEAD request to test each manifest path, without downloading its
    # content. If it succeeds, we're done! `Net::HTTP#head` takes a request
    # path (the host comes from the connection), so we send the path and
    # rebuild the absolute URL only for the return value.
    potential_manifest_paths.each do |potential_manifest_path|
      res = http.head(potential_manifest_path)

      case res
      when Net::HTTPOK
        return "https://images.neopets.com#{potential_manifest_path}"
      when Net::HTTPNotFound
        next
      else
        raise "unexpected manifest response code: #{res.code}"
      end
    end

    # Otherwise, there's no valid manifest URL.
    raise "none of the common manifest URL patterns returned HTTP 200"
  end
end
|
|
@ -10,7 +10,7 @@
|
||||||
#
|
#
|
||||||
# It's strongly recommended that you check this file into your version control system.
|
# It's strongly recommended that you check this file into your version control system.
|
||||||
|
|
||||||
ActiveRecord::Schema[7.0].define(version: 2023_08_07_005748) do
|
ActiveRecord::Schema[7.1].define(version: 2023_08_07_005748) do
|
||||||
create_table "users", id: { type: :integer, unsigned: true }, charset: "utf8mb3", force: :cascade do |t|
|
create_table "users", id: { type: :integer, unsigned: true }, charset: "utf8mb3", force: :cascade do |t|
|
||||||
t.string "name", limit: 20, null: false
|
t.string "name", limit: 20, null: false
|
||||||
t.string "encrypted_password", limit: 64, null: false
|
t.string "encrypted_password", limit: 64, null: false
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
#
|
#
|
||||||
# It's strongly recommended that you check this file into your version control system.
|
# It's strongly recommended that you check this file into your version control system.
|
||||||
|
|
||||||
ActiveRecord::Schema[7.0].define(version: 2023_10_24_221826) do
|
ActiveRecord::Schema[7.1].define(version: 2023_11_10_043543) do
|
||||||
create_table "auth_servers", id: :integer, charset: "latin1", force: :cascade do |t|
|
create_table "auth_servers", id: :integer, charset: "latin1", force: :cascade do |t|
|
||||||
t.string "short_name", limit: 10, null: false
|
t.string "short_name", limit: 10, null: false
|
||||||
t.string "name", limit: 40, null: false
|
t.string "name", limit: 40, null: false
|
||||||
|
@ -264,6 +264,7 @@ ActiveRecord::Schema[7.0].define(version: 2023_10_24_221826) do
|
||||||
t.text "manifest", size: :medium
|
t.text "manifest", size: :medium
|
||||||
t.timestamp "manifest_cached_at"
|
t.timestamp "manifest_cached_at"
|
||||||
t.string "known_glitches", limit: 128, default: ""
|
t.string "known_glitches", limit: 128, default: ""
|
||||||
|
t.string "manifest_url"
|
||||||
t.index ["body_id"], name: "swf_assets_body_id_and_object_id"
|
t.index ["body_id"], name: "swf_assets_body_id_and_object_id"
|
||||||
t.index ["type", "remote_id"], name: "swf_assets_type_and_id"
|
t.index ["type", "remote_id"], name: "swf_assets_type_and_id"
|
||||||
t.index ["zone_id"], name: "idx_swf_assets_zone_id"
|
t.index ["zone_id"], name: "idx_swf_assets_zone_id"
|
||||||
|
|
Loading…
Reference in a new issue