# impress/lib/tasks/fix_encoding.rake

namespace :db do
  # Interactive maintenance task that repairs double-encoded (mojibake) text in
  # Item#name and Item#description.
  #
  # Flow:
  #   1. Walk every Item via Item.find_each and look for the known bad sequences.
  #   2. Print per-pattern hit counts and up to five example items.
  #   3. Ask for confirmation on standard input. Any answer other than "y"/"Y"
  #      (including end-of-input) aborts without writing anything.
  #   4. Replace the bad sequences in the affected items, save them with
  #      validations skipped, and print a summary.
  #
  # NULL names/descriptions are tolerated: they never match a pattern and are
  # left as nil.
  #
  # Usage: rake db:fix_double_encoding
  desc "Fix double-encoded UTF-8 strings in item names and descriptions"
  task fix_double_encoding: :environment do
    banner = "=" * 80
    puts banner
    puts "Fix Double-Encoded Strings in Database"
    puts banner
    puts

    # Tag a byte-escaped literal as UTF-8. The literal is duplicated first so
    # this keeps working even if string literals in this file are frozen.
    as_utf8 = ->(bytes) { bytes.dup.force_encoding(Encoding::UTF_8) }

    # Lookup table: double-encoded sequence => intended UTF-8 sequence.
    # Everything is spelled with \x byte escapes so the table does not depend
    # on how this source file itself is encoded or displayed.
    encoding_fixes = {
      # Accented Latin letters
      "\xC4\x82\xC2\xA9" => "\xC3\xA9",                 # e with acute
      "\xC4\x82\xC2\xB1" => "\xC3\xB1",                 # n with tilde
      "\xC4\x82\xC2\xAD" => "\xC3\xAD",                 # i with acute
      "\xC4\x82\xC2\xA1" => "\xC3\xA1",                 # a with acute
      "\xC4\x82\xC2\xB3" => "\xC3\xB3",                 # o with acute
      "\xC4\x82\xC2\xBA" => "\xC3\xBA",                 # u with acute

      # Smart quotes and apostrophes
      "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2" => "\xE2\x80\x99", # right single quote
      "\xC3\xA2\xE2\x82\xAC\xC5\x93" => "\xE2\x80\x9C",     # left double quote
      "\xC3\xA2\xE2\x82\xAC\xC2\x9D" => "\xE2\x80\x9D",     # right double quote
      "\xC3\xA2\xE2\x82\xAC\xCB\x9C" => "\xE2\x80\x98",     # left single quote

      # Other punctuation
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C" => "\xE2\x80\x93", # en dash
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D" => "\xE2\x80\x94", # em dash
      "\xC3\xA2\xE2\x82\xAC\xC2\xA6" => "\xE2\x80\xA6",     # ellipsis

      # Non-breaking space
      "\xC3\x82\xC2\xA0" => "\xC2\xA0"
    }.each_with_object({}) do |(bad, good), fixes|
      fixes[as_utf8.call(bad)] = as_utf8.call(good)
    end

    puts "Will fix the following patterns:"
    encoding_fixes.each do |bad, good|
      puts "  #{bad.inspect} → #{good.inspect}"
    end
    puts

    # Find affected items by actually checking for the pattern in Ruby
    # (MySQL LIKE queries give false positives with multi-byte UTF-8)
    puts "Scanning items for double-encoding patterns..."

    total_affected = Set.new
    count_by_pattern = Hash.new { |counts, pattern| counts[pattern] = { name: 0, description: 0 } }

    Item.find_each do |item|
      # Either column may be NULL; to_s turns nil into "" so it simply never matches.
      fields = { name: item.name.to_s, description: item.description.to_s }

      encoding_fixes.each_key do |pattern|
        fields.each do |field, text|
          next unless text.include?(pattern)

          count_by_pattern[pattern][field] += 1
          total_affected << item.id
        end
      end
    end

    puts
    count_by_pattern.each do |pattern, counts|
      puts "#{pattern.inspect}: #{counts[:name]} names, #{counts[:description]} descriptions"
    end

    puts
    puts "Total affected items: #{total_affected.size}"
    puts

    if total_affected.empty?
      puts "No items need fixing!"
      next
    end

    affected_ids = total_affected.to_a
    sample_ids = affected_ids.first(5)

    # Show some examples
    puts "Example affected items:"
    Item.where(id: sample_ids).each do |item|
      puts "  #{item.id}: #{item.name}"
    end
    puts "  ... and #{affected_ids.size - 5} more" if affected_ids.size > 5
    puts

    # Ask for confirmation. gets returns nil at end-of-input (e.g. when the
    # task is run without a terminal); to_s makes that an implicit "no".
    print "Fix these items by replacing double-encoded characters? (y/N): "
    response = $stdin.gets.to_s.chomp
    unless response.downcase == 'y'
      puts "Aborted."
      next
    end

    puts
    puts "Fixing items..."
    puts "-" * 80

    # Returns a copy of +text+ with every known bad sequence replaced;
    # nil (a NULL column) is passed through unchanged.
    repair = lambda do |text|
      text && encoding_fixes.reduce(text) { |fixed, (bad, good)| fixed.gsub(bad, good) }
    end

    fixed_count = 0
    no_change_count = 0

    Item.where(id: affected_ids).find_each do |item|
      original_name = item.name
      original_description = item.description

      new_name = repair.call(original_name)
      new_description = repair.call(original_description)

      name_changed = new_name != original_name
      description_changed = new_description != original_description

      # Only save if something changed
      unless name_changed || description_changed
        no_change_count += 1
        next
      end

      item.name = new_name if name_changed
      item.description = new_description if description_changed
      item.save!(validate: false) # Skip validations to avoid potential issues

      if name_changed
        puts "#{item.id}: #{original_name.inspect} → #{new_name.inspect}"
      else
        puts "#{item.id}: Updated description only"
      end

      fixed_count += 1
    end

    puts
    puts "-" * 80
    puts "Complete!"
    puts "  ✓ Fixed: #{fixed_count}"
    puts "  ⊘ No changes needed: #{no_change_count}"
    puts

    # Show a sample of fixed items
    puts "Sample of fixed items:"
    Item.where(id: sample_ids).each do |item|
      puts "  #{item.id}: #{item.name}"
    end
    puts
  end
end