From 83281591b3c3c3969a94479498f2c27e68646336 Mon Sep 17 00:00:00 2001 From: Emi Matchu Date: Sun, 4 Jan 2026 19:20:31 -0800 Subject: [PATCH] Fix inconsistent item string encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We discovered that a previous string encoding fix, for a bug that was causing crashes for some items, was causing *other* items to get reencoded incorrectly. In this change, we make the reencoding conditional, only if parsing as UTF-8 is failing. We also include a temporary repair script, to run in production then delete—but held here in git history for posterity. --- .../remote_gateway/request.rb | 35 ++-- lib/tasks/fix_encoding.rake | 150 ++++++++++++++++++ 2 files changed, 166 insertions(+), 19 deletions(-) create mode 100644 lib/tasks/fix_encoding.rake diff --git a/lib/rocketamf_extensions/remote_gateway/request.rb b/lib/rocketamf_extensions/remote_gateway/request.rb index 81a7e484..66db5688 100644 --- a/lib/rocketamf_extensions/remote_gateway/request.rb +++ b/lib/rocketamf_extensions/remote_gateway/request.rb @@ -30,24 +30,17 @@ module RocketAMFExtensions raise RocketAMF::AMFError.new(first_message_data) end - # HACK: It seems to me that these messages come back with Windows-1250 - # (or similar) encoding on the strings? I'm basing this on the - # Patchwork Staff item, whose description arrives as: + # HACK: Older items in Neopets' database have Windows-1250 encoding, + # while newer items use proper UTF-8. We detect which encoding was used + # by checking if the string is valid UTF-8, and only re-encode if needed. # - # "That staff is cute, but dont use it as a walking stick \x96 I " + - # "dont think it will hold you up!" + # Example of Windows-1250 item: Patchwork Staff (57311), whose + # description contains byte 0x96 (en-dash in Windows-1250). 
# - # And the `\x96` is meant to represent an endash, which it doesn't in - # UTF-8 or in most extended ASCII encodings, but *does* in Windows's - # specific extended ASCII. - # - # Idk if this is something to do with the AMFPHP spec or how the AMFPHP - # server code they use serializes strings (I couldn't find any - # reference to it?), or just their internal database encoding being - # passed along as-is, or what? But this seems to be the most correct - # interpretation I know how to do, so, let's do it! + # Example of UTF-8 item: Carnival Party Décor (80042), whose name + # contains proper UTF-8 bytes [195, 169] for the é character. result.messages[0].data.body.tap do |body| - reencode_strings! body, "Windows-1250", "UTF-8" + reencode_strings_if_needed! body, "Windows-1250", "UTF-8" end end @@ -92,13 +85,17 @@ module RocketAMFExtensions end end - def reencode_strings!(target, from, to) + def reencode_strings_if_needed!(target, from, to) if target.is_a? String - target.force_encoding(from).encode!(to) + # Only re-encode if the string is not valid UTF-8 + # (indicating it's in the old Windows-1250 encoding) + unless target.valid_encoding? + target.force_encoding(from).encode!(to) + end elsif target.is_a? Array - target.each { |x| reencode_strings!(x, from, to) } + target.each { |x| reencode_strings_if_needed!(x, from, to) } elsif target.is_a? 
Hash - target.values.each { |x| reencode_strings!(x, from, to) } + target.values.each { |x| reencode_strings_if_needed!(x, from, to) } end end end diff --git a/lib/tasks/fix_encoding.rake b/lib/tasks/fix_encoding.rake new file mode 100644 index 00000000..e9cb2883 --- /dev/null +++ b/lib/tasks/fix_encoding.rake @@ -0,0 +1,150 @@ +namespace :db do + desc "Fix double-encoded UTF-8 strings in item names and descriptions" + task fix_double_encoding: :environment do + puts "=" * 80 + puts "Fix Double-Encoded Strings in Database" + puts "=" * 80 + puts + + # Define the double-encoding patterns and their fixes + # Each pattern maps: double-encoded string -> correct UTF-8 string + # Using byte arrays to avoid encoding issues in the source file itself + encoding_fixes = { + # Common accented characters (Ă© => é, etc.) + "\xC4\x82\xC2\xA9".force_encoding('UTF-8') => "\xC3\xA9".force_encoding('UTF-8'), # é + "\xC4\x82\xC2\xB1".force_encoding('UTF-8') => "\xC3\xB1".force_encoding('UTF-8'), # ñ + "\xC4\x82\xC2\xAD".force_encoding('UTF-8') => "\xC3\xAD".force_encoding('UTF-8'), # í + "\xC4\x82\xC2\xA1".force_encoding('UTF-8') => "\xC3\xA1".force_encoding('UTF-8'), # á + "\xC4\x82\xC2\xB3".force_encoding('UTF-8') => "\xC3\xB3".force_encoding('UTF-8'), # ó + "\xC4\x82\xC2\xBA".force_encoding('UTF-8') => "\xC3\xBA".force_encoding('UTF-8'), # ú + + # Smart quotes and apostrophes + "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2".force_encoding('UTF-8') => "\xE2\x80\x99".force_encoding('UTF-8'), # ' + "\xC3\xA2\xE2\x82\xAC\xC5\x93".force_encoding('UTF-8') => "\xE2\x80\x9C".force_encoding('UTF-8'), # " + "\xC3\xA2\xE2\x82\xAC\xC2\x9D".force_encoding('UTF-8') => "\xE2\x80\x9D".force_encoding('UTF-8'), # " + "\xC3\xA2\xE2\x82\xAC\xCB\x9C".force_encoding('UTF-8') => "\xE2\x80\x98".force_encoding('UTF-8'), # ' + + # Other punctuation + "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C".force_encoding('UTF-8') => "\xE2\x80\x93".force_encoding('UTF-8'), # – + "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D".force_encoding('UTF-8') => 
"\xE2\x80\x94".force_encoding('UTF-8'), # — + "\xC3\xA2\xE2\x82\xAC\xC2\xA6".force_encoding('UTF-8') => "\xE2\x80\xA6".force_encoding('UTF-8'), # … + + # Non-breaking space + "\xC3\x82\xC2\xA0".force_encoding('UTF-8') => "\xC2\xA0".force_encoding('UTF-8'), + } + + puts "Will fix the following patterns:" + encoding_fixes.each do |bad, good| + puts " #{bad.inspect} → #{good.inspect}" + end + puts + + # Find affected items by actually checking for the pattern in Ruby + # (MySQL LIKE queries give false positives with multi-byte UTF-8) + puts "Scanning items for double-encoding patterns..." + + items_by_pattern = {} + total_affected = Set.new + count_by_pattern = Hash.new { |h, k| h[k] = { name: 0, description: 0 } } + + Item.find_each do |item| + encoding_fixes.each_key do |pattern| + if item.name.include?(pattern) + items_by_pattern[pattern] ||= { name: [], description: [] } + items_by_pattern[pattern][:name] << item.id + total_affected << item.id + count_by_pattern[pattern][:name] += 1 + end + + if item.description.include?(pattern) + items_by_pattern[pattern] ||= { name: [], description: [] } + items_by_pattern[pattern][:description] << item.id + total_affected << item.id + count_by_pattern[pattern][:description] += 1 + end + end + end + + puts + count_by_pattern.each do |pattern, counts| + puts "#{pattern.inspect}: #{counts[:name]} names, #{counts[:description]} descriptions" + end + + puts + puts "Total affected items: #{total_affected.size}" + puts + + if total_affected.empty? + puts "No items need fixing!" + next + end + + # Show some examples + puts "Example affected items:" + Item.where(id: total_affected.to_a.first(5)).each do |item| + puts " #{item.id}: #{item.name}" + end + puts " ... and #{total_affected.size - 5} more" if total_affected.size > 5 + puts + + # Ask for confirmation + print "Fix these items by replacing double-encoded characters? (y/N): " + response = STDIN.gets.chomp + unless response.downcase == 'y' + puts "Aborted." 
+ next + end + + puts + puts "Fixing items..." + puts "-" * 80 + + fixed_count = 0 + no_change_count = 0 + + Item.where(id: total_affected.to_a).find_each do |item| + original_name = item.name + original_description = item.description + + # Apply all fixes to name and description + new_name = original_name.dup + new_description = original_description.dup + + encoding_fixes.each do |bad, good| + new_name.gsub!(bad, good) + new_description.gsub!(bad, good) + end + + # Only save if something changed + if new_name != original_name || new_description != original_description + item.name = new_name + item.description = new_description + item.save!(validate: false) # Skip validations to avoid potential issues + + if new_name != original_name + puts "#{item.id}: #{original_name.inspect} → #{new_name.inspect}" + elsif new_description != original_description + puts "#{item.id}: Updated description only" + end + + fixed_count += 1 + else + no_change_count += 1 + end + end + + puts + puts "-" * 80 + puts "Complete!" + puts " ✓ Fixed: #{fixed_count}" + puts " ⊘ No changes needed: #{no_change_count}" + puts + + # Show a sample of fixed items + puts "Sample of fixed items:" + Item.where(id: total_affected.to_a.first(5)).each do |item| + puts " #{item.id}: #{item.name}" + end + puts + end +end