add rake items:spider_mall

Also allows NULL on some item fields, and puts the swf_assets type-and-id
index in an actual migration; otherwise this commit would have removed
it upon migrating.
This commit is contained in:
Emi Matchu 2010-11-25 11:10:21 -05:00
parent e2d48a67df
commit d63da31ac2
7 changed files with 144 additions and 14 deletions

View file

@ -18,6 +18,8 @@ gem 'jammit', '~> 0.5.3'
gem 'hoptoad_notifier' gem 'hoptoad_notifier'
gem 'addressable', :require => ['addressable/template', 'addressable/uri']
group :development_async, :production do group :development_async, :production do
# async wrappers # async wrappers
gem 'eventmachine', :git => 'git://github.com/eventmachine/eventmachine.git' gem 'eventmachine', :git => 'git://github.com/eventmachine/eventmachine.git'
@ -33,7 +35,6 @@ group :development_async, :production do
# async http requires # async http requires
gem 'em-http-request',:git => 'git://github.com/igrigorik/em-http-request.git', :require => 'em-http' gem 'em-http-request',:git => 'git://github.com/igrigorik/em-http-request.git', :require => 'em-http'
gem 'addressable', :require => 'addressable/uri'
gem 'thin', '~> 1.2.7' gem 'thin', '~> 1.2.7'
end end

View file

@ -116,7 +116,7 @@ class Item < ActiveRecord::Base
end end
before_create do before_create do
self.sold_in_mall = false self.sold_in_mall ||= false
true true
end end
@ -261,6 +261,93 @@ class Item < ActiveRecord::Base
items.values items.values
end end
class << self
# Host and paths of the NC Mall pages that get spidered.
MALL_HOST = 'ncmall.neopets.com'
MALL_MAIN_PATH = '/mall/shop.phtml'
# {cat} is a template variable; it is expanded with a category ID.
MALL_CATEGORY_PATH = '/mall/ajax/load_page.phtml?type=browse&cat={cat}&lang=en'

# Matches the category-loading call found in the main page's HTML. The
# capture group is the numeric category ID.
MALL_CATEGORY_TRIGGER = /load_items_pane\("browse", ([0-9]+)\);/

# Key of the category JSON document under which the item data is read.
MALL_JSON_ITEM_DATA_KEY = 'object_data'

# sprintf format for an item's thumbnail URL; %s receives the item's
# 'imageFile' value.
MALL_ITEM_URL_TEMPLATE = 'http://images.neopets.com/items/%s.gif'

# Full URI of the mall's main page.
MALL_MAIN_URI = Addressable::URI.new(
  :scheme => 'http',
  :host => MALL_HOST,
  :path => MALL_MAIN_PATH
)

# URI of a category's JSON document, still containing the {cat} variable,
# and the template built from it.
MALL_CATEGORY_URI = Addressable::URI.new(
  :scheme => 'http',
  :host => MALL_HOST,
  :path => MALL_CATEGORY_PATH
)
MALL_CATEGORY_TEMPLATE = Addressable::Template.new(MALL_CATEGORY_URI)
# Spiders the NC Mall for wearable items and saves the ones not stored yet.
#
# Loads the mall's main page, scans it for category triggers, loads each
# category's JSON document and collects the items parsed from it. Items
# whose IDs already exist in the database are dropped before saving.
# Progress is printed to stdout; problems are sent to the Rails logger.
#
# Returns a hash of item ID => Item for every item that was not already in
# the database (including any that failed to save).
#
# A category whose JSON can't be parsed (SpiderJSONError) is logged and
# skipped. Errors raised by spider_request are not rescued here.
def spider_mall!
  # Load the mall HTML, scan it for category onclicks
  items = {}
  spider_request(MALL_MAIN_URI).scan(MALL_CATEGORY_TRIGGER) do |match|
    # Plug the category ID into the URI for that category's JSON document
    uri = MALL_CATEGORY_TEMPLATE.expand :cat => match[0]
    begin
      # Load up that JSON and send it off to be parsed
      puts "Loading #{uri}..."
      category_items = spider_mall_category(spider_request(uri))
      puts "...found #{category_items.size} items"
      items.merge!(category_items)
    rescue SpiderJSONError => e
      # If there was a parsing error, add where it came from
      Rails.logger.warn "Error parsing JSON at #{uri}, skipping: #{e.message}"
    end
  end
  puts "#{items.size} items total"

  # Remove items from the list that already exist, so as to avoid
  # unnecessary saves
  existing_item_ids = Item.find_all_by_id(items.keys, :select => :id).map(&:id)
  items = items.except(*existing_item_ids)
  puts "#{items.size} new items"

  items.each do |item_id, item|
    # save returns false instead of raising when the record can't be
    # stored, so only report success when it actually happened.
    if item.save
      puts "Saved #{item.name} (#{item_id})"
    else
      Rails.logger.warn "Failed to save #{item.name} (#{item_id}): " +
        item.errors.full_messages.join(', ')
    end
  end
  items
end
private
# Parses one category's JSON document into unsaved Item records.
#
# json - the raw JSON text of a mall category document.
#
# Only entries whose 'isWearable' value is 1 are kept. Each kept entry
# becomes an Item built from its 'name', 'description' and 'price', with
# its id taken from 'id', its thumbnail URL built from 'imageFile', and
# sold_in_mall set to true.
#
# Returns a hash of item ID => Item.
# Raises SpiderJSONError if the JSON can't be read or has no item data.
def spider_mall_category(json)
  begin
    items_data = JSON.parse(json)[MALL_JSON_ITEM_DATA_KEY]
    unless items_data
      raise SpiderJSONError, "Missing key #{MALL_JSON_ITEM_DATA_KEY}"
    end
  rescue StandardError => e
    # Catch both errors parsing JSON and the missing key. Deliberately not
    # `rescue Exception`: an Interrupt or SystemExit must not be turned
    # into a SpiderJSONError, which the caller logs and skips.
    raise SpiderJSONError, e.message
  end

  items = {}
  items_data.each do |_item_id, item_data|
    next unless item_data['isWearable'] == 1

    relevant_item_data = item_data.slice('name', 'description', 'price')
    item = Item.new relevant_item_data
    item.id = item_data['id']
    item.thumbnail_url = sprintf(MALL_ITEM_URL_TEMPLATE, item_data['imageFile'])
    item.sold_in_mall = true
    items[item.id] = item
  end
  items
end
# Fetches a page over HTTP and returns its body.
#
# uri - the location to load; handed straight to Net::HTTP.get_response.
#
# Returns the response body.
# Raises SpiderHTTPError if a SocketError occurs while loading, or if the
# response is anything other than a Net::HTTPOK.
def spider_request(uri)
  reply =
    begin
      Net::HTTP.get_response(uri)
    rescue SocketError => socket_error
      raise SpiderHTTPError, "Error loading #{uri}: #{socket_error.message}"
    end

  return reply.body if reply.is_a?(Net::HTTPOK)

  raise SpiderHTTPError, "Error loading #{uri}: Response was a #{reply.class}"
end
# Common parent of the errors raised while spidering.
class SpiderError < RuntimeError
end

# Raised when a page could not be loaded, or came back with a non-OK response.
class SpiderHTTPError < SpiderError
end

# Raised when a JSON document could not be parsed, or lacks the item data.
class SpiderJSONError < SpiderError
end
end
private private
SearchFilterScopes = [] SearchFilterScopes = []

View file

@ -0,0 +1,9 @@
# Changes objects.sold_in_mall from a limit-1 integer column to a boolean
# column. The column stays NOT NULL in both directions.
class TreatSoldInMallAsABoolean < ActiveRecord::Migration
  class << self
    # Switch the column to a boolean.
    def up
      change_column(:objects, :sold_in_mall, :boolean, :null => false)
    end

    # Restore the integer column with a limit of 1.
    def down
      change_column(:objects, :sold_in_mall, :integer, :limit => 1, :null => false)
    end
  end
end

View file

@ -0,0 +1,17 @@
# Makes the category, type, rarity, rarity_index and weight_lbs columns of
# the objects table nullable. Rolling back makes them NOT NULL again.
class AllowNullForSomeObjectsFields < ActiveRecord::Migration
  # Allow NULL in each of the affected columns.
  def self.up
    set_null_allowed(true)
  end

  # Forbid NULL in each of the affected columns again.
  # NOTE(review): presumably fails if rows already hold NULLs in these
  # columns - confirm against the database in use.
  def self.down
    set_null_allowed(false)
  end

  # Re-declares every affected column with its existing type and limit,
  # changing only whether NULL is permitted.
  def self.set_null_allowed(allowed)
    [
      [:category,     :string,  50],
      [:type,         :string,  50],
      [:rarity,       :string,  25],
      [:rarity_index, :integer, 2],
      [:weight_lbs,   :integer, 2]
    ].each do |column, column_type, limit|
      change_column :objects, column, column_type, :limit => limit, :null => allowed
    end
  end
end

View file

@ -0,0 +1,9 @@
# Declares the swf_assets_type_and_id index, covering the type and id
# columns of swf_assets, in a migration.
class AddSwfAssetsTypeAndIdIndex < ActiveRecord::Migration
  class << self
    # Create the index over (type, id).
    def up
      add_index("swf_assets", ["type", "id"], :name => "swf_assets_type_and_id")
    end

    # Drop the index by name.
    def down
      remove_index("swf_assets", :name => "swf_assets_type_and_id")
    end
  end
end

View file

@ -10,7 +10,7 @@
# #
# It's strongly recommended to check this file into your version control system. # It's strongly recommended to check this file into your version control system.
ActiveRecord::Schema.define(:version => 20101110213044) do ActiveRecord::Schema.define(:version => 20101125160843) do
create_table "auth_servers", :force => true do |t| create_table "auth_servers", :force => true do |t|
t.string "short_name", :limit => 10, :null => false t.string "short_name", :limit => 10, :null => false
@ -49,14 +49,14 @@ ActiveRecord::Schema.define(:version => 20101110213044) do
t.text "thumbnail_url", :null => false t.text "thumbnail_url", :null => false
t.string "name", :limit => 100, :null => false t.string "name", :limit => 100, :null => false
t.text "description", :null => false t.text "description", :null => false
t.string "category", :limit => 50, :null => false t.string "category", :limit => 50
t.string "type", :limit => 50, :null => false t.string "type", :limit => 50
t.string "rarity", :limit => 25, :null => false t.string "rarity", :limit => 25
t.integer "rarity_index", :limit => 2, :null => false t.integer "rarity_index", :limit => 2
t.integer "price", :limit => 3, :null => false t.integer "price", :limit => 3, :null => false
t.integer "weight_lbs", :limit => 2, :null => false t.integer "weight_lbs", :limit => 2
t.text "species_support_ids" t.text "species_support_ids"
t.integer "sold_in_mall", :limit => 1, :null => false t.boolean "sold_in_mall", :null => false
t.datetime "last_spidered" t.datetime "last_spidered"
end end
@ -127,8 +127,8 @@ ActiveRecord::Schema.define(:version => 20101110213044) do
t.integer "body_id", :limit => 2, :null => false t.integer "body_id", :limit => 2, :null => false
end end
add_index "swf_assets", ["type", "id"], :name => "swf_assets_type_and_id"
add_index "swf_assets", ["body_id"], :name => "swf_assets_body_id_and_object_id" add_index "swf_assets", ["body_id"], :name => "swf_assets_body_id_and_object_id"
add_index "swf_assets", ["type", "id"], :name => "swf_assets_type_and_id"
add_index "swf_assets", ["zone_id"], :name => "idx_swf_assets_zone_id" add_index "swf_assets", ["zone_id"], :name => "idx_swf_assets_zone_id"
create_table "users", :force => true do |t| create_table "users", :force => true do |t|
@ -136,6 +136,7 @@ ActiveRecord::Schema.define(:version => 20101110213044) do
t.integer "auth_server_id", :limit => 1, :null => false t.integer "auth_server_id", :limit => 1, :null => false
t.integer "remote_id", :null => false t.integer "remote_id", :null => false
t.integer "points", :default => 0, :null => false t.integer "points", :default => 0, :null => false
t.boolean "beta", :default => false, :null => false
end end
create_table "zones", :force => true do |t| create_table "zones", :force => true do |t|

View file

@ -0,0 +1,6 @@
# Rake tasks that operate on items.
namespace :items do
  desc "Spider NC Mall for wearable items, and store them for later asset spidering"
  # Depends on :environment so that Item is available when the task runs.
  task :spider_mall => [:environment] do
    Item.spider_mall!
  end
end