1
0
Fork 0
forked from OpenNeo/impress

Fix inconsistent item string encoding

We discovered that a previous fix, for a string encoding bug that was causing crashes for some items, was causing *other* items to get reencoded incorrectly.

In this change, we make the reencoding conditional, only if parsing as UTF-8 is failing.

We also include a temporary repair script, to run in production then delete—but held here in git history for posterity.
This commit is contained in:
Emi Matchu 2026-01-04 19:20:31 -08:00
parent 7430e12655
commit 83281591b3
2 changed files with 166 additions and 19 deletions

View file

@ -30,24 +30,17 @@ module RocketAMFExtensions
raise RocketAMF::AMFError.new(first_message_data)
end
# HACK: It seems to me that these messages come back with Windows-1250
# (or similar) encoding on the strings? I'm basing this on the
# Patchwork Staff item, whose description arrives as:
# HACK: Older items in Neopets' database have Windows-1250 encoding,
# while newer items use proper UTF-8. We detect which encoding was used
# by checking if the string is valid UTF-8, and only re-encode if needed.
#
# "That staff is cute, but dont use it as a walking stick \x96 I " +
# "dont think it will hold you up!"
# Example of Windows-1250 item: Patchwork Staff (57311), whose
# description contains byte 0x96 (en-dash in Windows-1250).
#
# And the `\x96` is meant to represent an endash, which it doesn't in
# UTF-8 or in most extended ASCII encodings, but *does* in Windows's
# specific extended ASCII.
#
# Idk if this is something to do with the AMFPHP spec or how the AMFPHP
# server code they use serializes strings (I couldn't find any
# reference to it?), or just their internal database encoding being
# passed along as-is, or what? But this seems to be the most correct
# interpretation I know how to do, so, let's do it!
# Example of UTF-8 item: Carnival Party Décor (80042), whose name
# contains proper UTF-8 bytes [195, 169] for the é character.
result.messages[0].data.body.tap do |body|
reencode_strings! body, "Windows-1250", "UTF-8"
reencode_strings_if_needed! body, "Windows-1250", "UTF-8"
end
end
@ -92,13 +85,17 @@ module RocketAMFExtensions
end
end
def reencode_strings!(target, from, to)
def reencode_strings_if_needed!(target, from, to)
if target.is_a? String
target.force_encoding(from).encode!(to)
# Only re-encode if the string is not valid UTF-8
# (indicating it's in the old Windows-1250 encoding)
unless target.valid_encoding?
target.force_encoding(from).encode!(to)
end
elsif target.is_a? Array
target.each { |x| reencode_strings!(x, from, to) }
target.each { |x| reencode_strings_if_needed!(x, from, to) }
elsif target.is_a? Hash
target.values.each { |x| reencode_strings!(x, from, to) }
target.values.each { |x| reencode_strings_if_needed!(x, from, to) }
end
end
end

150
lib/tasks/fix_encoding.rake Normal file
View file

@ -0,0 +1,150 @@
namespace :db do
  desc "Fix double-encoded UTF-8 strings in item names and descriptions"
  # One-off interactive repair task. It scans every Item's name and
  # description for known double-encoded byte sequences, reports how many
  # matches were found, asks for confirmation on STDIN, and then rewrites the
  # affected fields with the corrected characters.
  task fix_double_encoding: :environment do
    puts "=" * 80
    puts "Fix Double-Encoded Strings in Database"
    puts "=" * 80
    puts

    # Define the double-encoding patterns and their fixes.
    # Each pattern maps: double-encoded string -> correct UTF-8 string.
    # The strings are written as byte escapes (rather than literal characters)
    # to avoid encoding issues in the source file itself.
    #
    # NOTE(review): the "\xC4\x82" prefix (Ă, U+0102) is what byte 0xC3 decodes
    # to under Windows-1250, but the second character of the á/ó/ú patterns
    # (U+00A1, U+00B3, U+00BA) looks like a Latin-1/Windows-1252 reading of
    # bytes 0xA1/0xB3/0xBA; under Windows-1250 those bytes would presumably
    # decode to U+02C7, U+0142 and U+015F instead. Likewise several of the
    # quote patterns use Windows-1252 mappings. Confirm against real rows that
    # every pattern below actually occurs in the data.
    encoding_fixes = {
      # Common accented characters (Ă© => é, etc.)
      "\xC4\x82\xC2\xA9".force_encoding('UTF-8') => "\xC3\xA9".force_encoding('UTF-8'), # é
      "\xC4\x82\xC2\xB1".force_encoding('UTF-8') => "\xC3\xB1".force_encoding('UTF-8'), # ñ
      "\xC4\x82\xC2\xAD".force_encoding('UTF-8') => "\xC3\xAD".force_encoding('UTF-8'), # í
      "\xC4\x82\xC2\xA1".force_encoding('UTF-8') => "\xC3\xA1".force_encoding('UTF-8'), # á
      "\xC4\x82\xC2\xB3".force_encoding('UTF-8') => "\xC3\xB3".force_encoding('UTF-8'), # ó
      "\xC4\x82\xC2\xBA".force_encoding('UTF-8') => "\xC3\xBA".force_encoding('UTF-8'), # ú
      # Smart quotes and apostrophes
      "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2".force_encoding('UTF-8') => "\xE2\x80\x99".force_encoding('UTF-8'), # right single quote (U+2019)
      "\xC3\xA2\xE2\x82\xAC\xC5\x93".force_encoding('UTF-8') => "\xE2\x80\x9C".force_encoding('UTF-8'), # left double quote (U+201C)
      "\xC3\xA2\xE2\x82\xAC\xC2\x9D".force_encoding('UTF-8') => "\xE2\x80\x9D".force_encoding('UTF-8'), # right double quote (U+201D)
      "\xC3\xA2\xE2\x82\xAC\xCB\x9C".force_encoding('UTF-8') => "\xE2\x80\x98".force_encoding('UTF-8'), # left single quote (U+2018)
      # Other punctuation
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C".force_encoding('UTF-8') => "\xE2\x80\x93".force_encoding('UTF-8'), # en dash (U+2013)
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D".force_encoding('UTF-8') => "\xE2\x80\x94".force_encoding('UTF-8'), # em dash (U+2014)
      "\xC3\xA2\xE2\x82\xAC\xC2\xA6".force_encoding('UTF-8') => "\xE2\x80\xA6".force_encoding('UTF-8'), # ellipsis (U+2026)
      # Non-breaking space
      "\xC3\x82\xC2\xA0".force_encoding('UTF-8') => "\xC2\xA0".force_encoding('UTF-8'),
    }

    # Applies every replacement to a copy of `text`. A nil field (NULL column)
    # is passed through untouched so it is neither crashed on nor rewritten
    # to an empty string.
    repair = lambda do |text|
      text && encoding_fixes.reduce(text) { |fixed, (bad, good)| fixed.gsub(bad, good) }
    end

    puts "Will fix the following patterns:"
    encoding_fixes.each do |bad, good|
      puts " #{bad.inspect} => #{good.inspect}"
    end
    puts

    # Find affected items by actually checking for the pattern in Ruby
    # (MySQL LIKE queries give false positives with multi-byte UTF-8)
    puts "Scanning items for double-encoding patterns..."
    total_affected = Set.new
    count_by_pattern = Hash.new { |h, k| h[k] = { name: 0, description: 0 } }

    Item.find_each do |item|
      # to_s guards against NULL names/descriptions, which cannot match.
      fields = { name: item.name.to_s, description: item.description.to_s }

      encoding_fixes.each_key do |pattern|
        fields.each do |field, text|
          next unless text.include?(pattern)

          total_affected << item.id
          count_by_pattern[pattern][field] += 1
        end
      end
    end

    puts
    count_by_pattern.each do |pattern, counts|
      puts "#{pattern.inspect}: #{counts[:name]} names, #{counts[:description]} descriptions"
    end
    puts
    puts "Total affected items: #{total_affected.size}"
    puts

    if total_affected.empty?
      puts "No items need fixing!"
      next
    end

    affected_ids = total_affected.to_a
    sample_ids = affected_ids.first(5)

    # Show some examples
    puts "Example affected items:"
    Item.where(id: sample_ids).each do |item|
      puts " #{item.id}: #{item.name}"
    end
    puts " ... and #{total_affected.size - 5} more" if total_affected.size > 5
    puts

    # Ask for confirmation. `gets` returns nil at EOF (e.g. when the task is
    # run without a terminal), which is treated the same as answering "no".
    print "Fix these items by replacing double-encoded characters? (y/N): "
    response = STDIN.gets.to_s.strip
    unless response.downcase == 'y'
      puts "Aborted."
      next
    end

    puts
    puts "Fixing items..."
    puts "-" * 80
    fixed_count = 0
    no_change_count = 0

    Item.where(id: affected_ids).find_each do |item|
      original_name = item.name
      original_description = item.description

      # Apply all fixes to name and description
      new_name = repair.call(original_name)
      new_description = repair.call(original_description)

      name_changed = new_name != original_name
      description_changed = new_description != original_description

      # Only save if something changed
      unless name_changed || description_changed
        no_change_count += 1
        next
      end

      item.name = new_name
      item.description = new_description
      item.save!(validate: false) # Skip validations to avoid potential issues

      if name_changed
        puts "#{item.id}: #{original_name.inspect} => #{new_name.inspect}"
      else
        puts "#{item.id}: Updated description only"
      end
      fixed_count += 1
    end

    puts
    puts "-" * 80
    puts "Complete!"
    puts " ✓ Fixed: #{fixed_count}"
    puts " ⊘ No changes needed: #{no_change_count}"
    puts

    # Show a sample of fixed items
    puts "Sample of fixed items:"
    Item.where(id: sample_ids).each do |item|
      puts " #{item.id}: #{item.name}"
    end
    puts
  end
end