# Forked from OpenNeo/impress (Ruby, 151 lines, 5.3 KiB)
namespace :db do
  # Repairs item names and descriptions that contain double-encoded UTF-8
  # sequences (for example "Ă©" where "é" was intended).
  #
  # The task is interactive. It walks every Item, prints how many names and
  # descriptions contain each known bad sequence, shows a few examples, and
  # only rewrites records after "y" is entered on standard input. Anything
  # else, including end-of-input, aborts without touching the database.
  #
  # Usage: rake db:fix_double_encoding
  desc "Fix double-encoded UTF-8 strings in item names and descriptions"
  task fix_double_encoding: :environment do
    puts "=" * 80
    puts "Fix Double-Encoded Strings in Database"
    puts "=" * 80
    puts

    # Define the double-encoding patterns and their fixes.
    # Each pattern maps: double-encoded string -> correct UTF-8 string.
    # Byte escapes are used to avoid encoding issues in the source file itself.
    encoding_fixes = {
      # Common accented characters (Ă© => é, etc.)
      "\xC4\x82\xC2\xA9".force_encoding('UTF-8') => "\xC3\xA9".force_encoding('UTF-8'), # é
      "\xC4\x82\xC2\xB1".force_encoding('UTF-8') => "\xC3\xB1".force_encoding('UTF-8'), # ñ
      "\xC4\x82\xC2\xAD".force_encoding('UTF-8') => "\xC3\xAD".force_encoding('UTF-8'), # í
      "\xC4\x82\xC2\xA1".force_encoding('UTF-8') => "\xC3\xA1".force_encoding('UTF-8'), # á
      "\xC4\x82\xC2\xB3".force_encoding('UTF-8') => "\xC3\xB3".force_encoding('UTF-8'), # ó
      "\xC4\x82\xC2\xBA".force_encoding('UTF-8') => "\xC3\xBA".force_encoding('UTF-8'), # ú

      # Smart quotes and apostrophes
      "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2".force_encoding('UTF-8') => "\xE2\x80\x99".force_encoding('UTF-8'), # U+2019 right single quote
      "\xC3\xA2\xE2\x82\xAC\xC5\x93".force_encoding('UTF-8') => "\xE2\x80\x9C".force_encoding('UTF-8'), # U+201C left double quote
      "\xC3\xA2\xE2\x82\xAC\xC2\x9D".force_encoding('UTF-8') => "\xE2\x80\x9D".force_encoding('UTF-8'), # U+201D right double quote
      "\xC3\xA2\xE2\x82\xAC\xCB\x9C".force_encoding('UTF-8') => "\xE2\x80\x98".force_encoding('UTF-8'), # U+2018 left single quote

      # Other punctuation
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C".force_encoding('UTF-8') => "\xE2\x80\x93".force_encoding('UTF-8'), # U+2013 en dash
      "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D".force_encoding('UTF-8') => "\xE2\x80\x94".force_encoding('UTF-8'), # U+2014 em dash
      "\xC3\xA2\xE2\x82\xAC\xC2\xA6".force_encoding('UTF-8') => "\xE2\x80\xA6".force_encoding('UTF-8'), # U+2026 ellipsis

      # Non-breaking space
      "\xC3\x82\xC2\xA0".force_encoding('UTF-8') => "\xC2\xA0".force_encoding('UTF-8'),
    }

    # Returns a copy of +text+ with every bad sequence replaced. A nil value is
    # passed through untouched so that a record with a missing name or
    # description does not raise.
    fix_text = lambda do |text|
      next text if text.nil?

      encoding_fixes.reduce(text) { |fixed, (bad, good)| fixed.gsub(bad, good) }
    end

    puts "Will fix the following patterns:"
    encoding_fixes.each do |bad, good|
      puts "  #{bad.inspect} → #{good.inspect}"
    end
    puts

    # Find affected items by actually checking for the pattern in Ruby
    # (MySQL LIKE queries give false positives with multi-byte UTF-8)
    puts "Scanning items for double-encoding patterns..."

    total_affected = Set.new
    count_by_pattern = Hash.new { |h, k| h[k] = { name: 0, description: 0 } }

    Item.find_each do |item|
      fields = { name: item.name, description: item.description }

      encoding_fixes.each_key do |pattern|
        fields.each do |field, text|
          # `&.` skips nil columns instead of raising NoMethodError.
          next unless text&.include?(pattern)

          total_affected << item.id
          count_by_pattern[pattern][field] += 1
        end
      end
    end

    puts
    count_by_pattern.each do |pattern, counts|
      puts "#{pattern.inspect}: #{counts[:name]} names, #{counts[:description]} descriptions"
    end

    puts
    puts "Total affected items: #{total_affected.size}"
    puts

    if total_affected.empty?
      puts "No items need fixing!"
      next
    end

    # Show some examples
    puts "Example affected items:"
    Item.where(id: total_affected.to_a.first(5)).each do |item|
      puts "  #{item.id}: #{item.name}"
    end
    puts "  ... and #{total_affected.size - 5} more" if total_affected.size > 5
    puts

    # Ask for confirmation. `gets` returns nil at end-of-input (closed or
    # piped stdin); `to_s` turns that into "" so the run aborts cleanly.
    print "Fix these items by replacing double-encoded characters? (y/N): "
    response = $stdin.gets.to_s.chomp
    unless response.downcase == 'y'
      puts "Aborted."
      next
    end

    puts
    puts "Fixing items..."
    puts "-" * 80

    fixed_count = 0
    no_change_count = 0

    Item.where(id: total_affected.to_a).find_each do |item|
      original_name = item.name
      original_description = item.description

      # Apply all fixes to name and description
      new_name = fix_text.call(original_name)
      new_description = fix_text.call(original_description)

      name_changed = new_name != original_name
      description_changed = new_description != original_description

      # Only save if something changed
      unless name_changed || description_changed
        no_change_count += 1
        next
      end

      item.name = new_name
      item.description = new_description
      item.save!(validate: false) # Skip validations to avoid potential issues

      if name_changed
        puts "#{item.id}: #{original_name.inspect} → #{new_name.inspect}"
      else
        puts "#{item.id}: Updated description only"
      end

      fixed_count += 1
    end

    puts
    puts "-" * 80
    puts "Complete!"
    puts "  ✓ Fixed: #{fixed_count}"
    puts "  ⊘ No changes needed: #{no_change_count}"
    puts

    # Show a sample of fixed items
    puts "Sample of fixed items:"
    Item.where(id: total_affected.to_a.first(5)).each do |item|
      puts "  #{item.id}: #{item.name}"
    end
    puts
  end
end