Fix inconsistent item string encoding

We discovered that a previous fix for a string encoding bug (one that was causing crashes for some items) was causing *other* items to get re-encoded incorrectly. In this change, we make the re-encoding conditional: it only happens if parsing the string as UTF-8 fails. We also include a temporary repair script, to be run in production and then deleted—but held here in git history for posterity.
2026-01-04 19:20:31 -08:00 · 2026-01-04 19:20:31 -08:00 · 83281591b3
commit 83281591b3
parent 7430e12655
2 changed files with 166 additions and 19 deletions
--- a/lib/rocketamf_extensions/remote_gateway/request.rb
+++ b/lib/rocketamf_extensions/remote_gateway/request.rb
@ -30,24 +30,17 @@ module RocketAMFExtensions
          raise RocketAMF::AMFError.new(first_message_data)
        end
-        # HACK: It seems to me that these messages come back with Windows-1250
+        # HACK: Older items in Neopets' database have Windows-1250 encoding,
-        # (or similar) encoding on the strings? I'm basing this on the
+        # while newer items use proper UTF-8. We detect which encoding was used
-        # Patchwork Staff item, whose description arrives as:
+        # by checking if the string is valid UTF-8, and only re-encode if needed.
        #
-        # "That staff is cute, but dont use it as a walking stick \x96 I " +
+        # Example of Windows-1250 item: Patchwork Staff (57311), whose
-        # "dont think it will hold you up!"
+        # description contains byte 0x96 (en-dash in Windows-1250).
        #
-        # And the `\x96` is meant to represent an endash, which it doesn't in
+        # Example of UTF-8 item: Carnival Party Décor (80042), whose name
-        # UTF-8 or in most extended ASCII encodings, but *does* in Windows's
+        # contains proper UTF-8 bytes [195, 169] for the é character.
        # specific extended ASCII.
        #
        # Idk if this is something to do with the AMFPHP spec or how the AMFPHP
        # server code they use serializes strings (I couldn't find any
        # reference to it?), or just their internal database encoding being
        # passed along as-is, or what? But this seems to be the most correct
        # interpretation I know how to do, so, let's do it!
        result.messages[0].data.body.tap do |body|
-          reencode_strings! body, "Windows-1250", "UTF-8"
+          reencode_strings_if_needed! body, "Windows-1250", "UTF-8"
        end
      end
@ -92,13 +85,17 @@ module RocketAMFExtensions
        end
      end
-      def reencode_strings!(target, from, to)
+      def reencode_strings_if_needed!(target, from, to)
        if target.is_a? String
-          target.force_encoding(from).encode!(to)
+          # Only re-encode if the string is not valid UTF-8
          # (indicating it's in the old Windows-1250 encoding)
          unless target.valid_encoding?
            target.force_encoding(from).encode!(to)
          end
        elsif target.is_a? Array
-          target.each { |x| reencode_strings!(x, from, to) }
+          target.each { |x| reencode_strings_if_needed!(x, from, to) }
        elsif target.is_a? Hash
-          target.values.each { |x| reencode_strings!(x, from, to) }
+          target.values.each { |x| reencode_strings_if_needed!(x, from, to) }
        end
      end
    end
--- a/lib/tasks/fix_encoding.rake
+++ b/lib/tasks/fix_encoding.rake
@ -0,0 +1,150 @@
 namespace :db do
  # One-off repair task. Scans every Item for a fixed list of known
  # double-encoded ("mojibake") sequences in its name and description, reports
  # what it found, asks for interactive confirmation, and then replaces each
  # bad sequence with the intended UTF-8 character and saves the item.
  #
  # Answering anything other than "y" (including EOF on stdin, e.g. when run
  # non-interactively) aborts without writing.
  desc "Fix double-encoded UTF-8 strings in item names and descriptions"
  task fix_double_encoding: :environment do
    puts "=" * 80
    puts "Fix Double-Encoded Strings in Database"
    puts "=" * 80
    puts
    # Define the double-encoding patterns and their fixes
    # Each pattern maps: double-encoded string -> correct UTF-8 string
    # Written as escaped byte sequences (rather than literal characters) to
    # avoid encoding issues in the source file itself
    encoding_fixes = {
      # Common accented characters (Ă© => é, etc.)
      "\xC4\x82\xC2\xA9".force_encoding('UTF-8') => "\xC3\xA9".force_encoding('UTF-8'),  # é
      "\xC4\x82\xC2\xB1".force_encoding('UTF-8') => "\xC3\xB1".force_encoding('UTF-8'),  # ñ
      "\xC4\x82\xC2\xAD".force_encoding('UTF-8') => "\xC3\xAD".force_encoding('UTF-8'),  # í
      "\xC4\x82\xC2\xA1".force_encoding('UTF-8') => "\xC3\xA1".force_encoding('UTF-8'),  # á
      "\xC4\x82\xC2\xB3".force_encoding('UTF-8') => "\xC3\xB3".force_encoding('UTF-8'),  # ó
      "\xC4\x82\xC2\xBA".force_encoding('UTF-8') => "\xC3\xBA".force_encoding('UTF-8'),  # ú
      # Smart quotes and apostrophes
      "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2".force_encoding('UTF-8') => "\xE2\x80\x99".force_encoding('UTF-8'),  # '
      "\xC3\xA2\xE2\x82\xAC\xC5\x93".force_encoding('UTF-8') => "\xE2\x80\x9C".force_encoding('UTF-8'),  # "
      "\xC3\xA2\xE2\x82\xAC\xC2\x9D".force_encoding('UTF-8') => "\xE2\x80\x9D".force_encoding('UTF-8'),  # "
      "\xC3\xA2\xE2\x82\xAC\xCB\x9C".force_encoding('UTF-8') => "\xE2\x80\x98".force_encoding('UTF-8'),  # '
      # Other punctuation
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C".force_encoding('UTF-8') => "\xE2\x80\x93".force_encoding('UTF-8'),  # –
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D".force_encoding('UTF-8') => "\xE2\x80\x94".force_encoding('UTF-8'),  # —
      "\xC3\xA2\xE2\x82\xAC\xC2\xA6".force_encoding('UTF-8') => "\xE2\x80\xA6".force_encoding('UTF-8'),  # …
      # Non-breaking space
      "\xC3\x82\xC2\xA0".force_encoding('UTF-8') => "\xC2\xA0".force_encoding('UTF-8'),
    }.freeze
    puts "Will fix the following patterns:"
    encoding_fixes.each do |bad, good|
      puts "  #{bad.inspect} → #{good.inspect}"
    end
    puts
    # Find affected items by actually checking for the pattern in Ruby
    # (MySQL LIKE queries give false positives with multi-byte UTF-8)
    puts "Scanning items for double-encoding patterns..."
    total_affected = Set.new
    count_by_pattern = Hash.new { |h, k| h[k] = { name: 0, description: 0 } }
    Item.find_each do |item|
      # to_s so that a NULL name/description is scanned as an empty string
      # instead of crashing the whole scan with NoMethodError.
      name = item.name.to_s
      description = item.description.to_s
      encoding_fixes.each_key do |pattern|
        if name.include?(pattern)
          total_affected << item.id
          count_by_pattern[pattern][:name] += 1
        end
        if description.include?(pattern)
          total_affected << item.id
          count_by_pattern[pattern][:description] += 1
        end
      end
    end
    puts
    count_by_pattern.each do |pattern, counts|
      puts "#{pattern.inspect}: #{counts[:name]} names, #{counts[:description]} descriptions"
    end
    puts
    puts "Total affected items: #{total_affected.size}"
    puts
    if total_affected.empty?
      puts "No items need fixing!"
      next
    end
    # Show some examples
    puts "Example affected items:"
    Item.where(id: total_affected.to_a.first(5)).each do |item|
      puts "  #{item.id}: #{item.name}"
    end
    puts "  ... and #{total_affected.size - 5} more" if total_affected.size > 5
    puts
    # Ask for confirmation. `gets` returns nil at EOF (e.g. stdin closed or
    # redirected from an empty source); to_s turns that into "" so we abort
    # cleanly rather than raising.
    print "Fix these items by replacing double-encoded characters? (y/N): "
    response = $stdin.gets.to_s.strip
    unless response.downcase == 'y'
      puts "Aborted."
      next
    end
    puts
    puts "Fixing items..."
    puts "-" * 80
    fixed_count = 0
    no_change_count = 0
    Item.where(id: total_affected.to_a).find_each do |item|
      original_name = item.name.to_s
      original_description = item.description.to_s
      # Apply all fixes to (copies of) name and description
      new_name = original_name.dup
      new_description = original_description.dup
      encoding_fixes.each do |bad, good|
        new_name.gsub!(bad, good)
        new_description.gsub!(bad, good)
      end
      name_changed = new_name != original_name
      description_changed = new_description != original_description
      # Only save if something changed, and only assign the fields that did,
      # so an untouched NULL column is not rewritten as an empty string.
      if name_changed || description_changed
        item.name = new_name if name_changed
        item.description = new_description if description_changed
        item.save!(validate: false) # Skip validations to avoid potential issues
        if name_changed
          puts "#{item.id}: #{original_name.inspect} → #{new_name.inspect}"
        else
          puts "#{item.id}: Updated description only"
        end
        fixed_count += 1
      else
        no_change_count += 1
      end
    end
    puts
    puts "-" * 80
    puts "Complete!"
    puts "  ✓ Fixed: #{fixed_count}"
    puts "  ⊘ No changes needed: #{no_change_count}"
    puts
    # Show a sample of fixed items
    puts "Sample of fixed items:"
    Item.where(id: total_affected.to_a.first(5)).each do |item|
      puts "  #{item.id}: #{item.name}"
    end
    puts
  end
 end