Compare commits

...

4 commits

Author SHA1 Message Date
46d3325144 Load *all* NC Mall pages in nc_mall:sync
Ta da! Now I can run this and pull 481 records into our database, and
then turn around and run it again and have them all correctly say
"skipped"!
2024-05-10 17:39:40 -07:00
b6e18e10a5 Add bare-bones rails nc_mall:sync task, incl. NCMallRecord model
Currently we only load the homepage, so there's only actually one
wearable item to sync up! But here's the task to do it!

To do this, we also created the backing model NCMallRecord, where we'll
save the current NC Mall state!
2024-05-07 17:40:14 -07:00
1f157b49da Load additional pages via NC Mall scraper service
This is for URLs like this! https://ncmall.neopets.com/mall/ajax/load_page.phtml?type=browse&cat=43&lang=en
2024-05-07 17:38:48 -07:00
7b0b6b70d2 Initial NC Mall scraper service
This doesn't connect to anything yet, I'm just doing the beginnings of
loading NC Mall item data!

My intent is to run this regularly to keep our own NC info in the
database too, primarily for use in the Item Getting Guide. (Could be
useful to surface in other places too though!) This will help us split
items into those that can be one-click purchased with the NC Mall
integration, vs NC items that need to be acquired by other means.
2024-05-07 16:06:37 -07:00
7 changed files with 260 additions and 1 deletions

View file

@ -0,0 +1,3 @@
# Our saved record of one item's NC Mall listing. Each record belongs to a
# single Item; the listing data itself (price and optional discount fields)
# lives in the columns of the `nc_mall_records` table.
class NCMallRecord < ApplicationRecord
belongs_to :item
end

116
app/services/nc_mall.rb Normal file
View file

@ -0,0 +1,116 @@
require "addressable/template"
require "async/http/internet/instance"
# Scraper service for the Neopets NC Mall. Loads the mall's AJAX page
# endpoints and its root HTML document, and boils the responses down to plain
# hashes of item data.
module NCMall
  # Share a pool of persistent connections, rather than reconnecting on
  # each request. (This library does that automatically!)
  INTERNET = Async::HTTP::Internet.instance

  HOME_PAGE_URL = "https://ncmall.neopets.com/mall/ajax/home_page.phtml"

  # Load the NC Mall home page content area, and return its useful data.
  #
  # @return [Hash] `{items: [...]}`, in the shape built by `parse_nc_page`.
  # @raise [ResponseNotOK] if the response status isn't 200.
  # @raise [UnexpectedResponseFormat] if the response can't be parsed.
  def self.load_home_page
    load_page_by_url HOME_PAGE_URL
  end

  CATEGORY_PAGE_URL_TEMPLATE = Addressable::Template.new(
    "https://ncmall.neopets.com/mall/ajax/load_page.phtml?lang=en{&type,cat}"
  )

  # Load the NC Mall page for a specific type and category ID.
  #
  # @param type [String] the page type, e.g. "browse".
  # @param cat [String, Integer] the category ID.
  # @return [Hash] `{items: [...]}`, in the shape built by `parse_nc_page`.
  # @raise [ResponseNotOK] if the response status isn't 200.
  # @raise [UnexpectedResponseFormat] if the response can't be parsed.
  def self.load_page(type, cat)
    load_page_by_url CATEGORY_PAGE_URL_TEMPLATE.expand(type:, cat:)
  end

  ROOT_DOCUMENT_URL = "https://ncmall.neopets.com/mall/shop.phtml"
  PAGE_LINK_PATTERN = /load_items_pane\(['"](.+?)['"], ([0-9]+)\).+?>(.+?)</

  # Load the NC Mall root document HTML, and extract the list of links to
  # other pages ("New", "Popular", etc.)
  #
  # @return [Array<Hash>] unique `{type:, cat:, label:}` hashes. All three
  #   values are strings captured from the HTML (so `cat` is a digit string).
  # @raise [ResponseNotOK] if the response status isn't 200.
  def self.load_page_links
    Sync do
      html = fetch_ok_body(ROOT_DOCUMENT_URL)

      # Extract `load_items_pane` calls from the root document's HTML. (We use
      # a very simplified regex, rather than actually parsing the full HTML!)
      html.scan(PAGE_LINK_PATTERN).
        map { |type, cat, label| {type:, cat:, label:} }.
        uniq
    end
  end

  # NOTE: The methods below are internal helpers. A bare `private` keyword has
  # no effect on `def self.` methods, so they are technically still callable
  # from outside (as they always have been); please don't rely on that.

  # Load the NC page JSON at the given URL, and parse the useful data out.
  #
  # @param url [String, Addressable::URI] the URL to request.
  # @return [Hash] `{items: [...]}`, in the shape built by `parse_nc_page`.
  # @raise [ResponseNotOK] if the response status isn't 200.
  # @raise [UnexpectedResponseFormat] if the response can't be parsed.
  def self.load_page_by_url(url)
    Sync { parse_nc_page fetch_ok_body(url) }
  end

  # GET the given URL with our Neopets user agent, and return the body.
  #
  # Both the page loader and the link loader go through here, so the status
  # check and its error message always refer to the URL actually requested.
  #
  # @param url [String, Addressable::URI] the URL to request.
  # @return [String] the response body.
  # @raise [ResponseNotOK] if the response status isn't 200.
  def self.fetch_ok_body(url)
    Sync do
      response = INTERNET.get(url, [
        ["User-Agent", Rails.configuration.user_agent_for_neopets],
      ])
      begin
        if response.status != 200
          raise ResponseNotOK.new(response.status),
            "expected status 200 but got #{response.status} (#{url})"
        end

        response.read
      ensure
        # Always release the response, even when we bail out before reading
        # the body, so the underlying connection isn't left tied up.
        response.close
      end
    end
  end

  # Given a string of NC page data, parse the useful data out of it!
  #
  # @param nc_page_str [String] the raw JSON body of an NC Mall page.
  # @return [Hash] `{items: [...]}`, where each item is a hash with the keys
  #   `:id`, `:name`, `:description`, `:price`, `:discount` (a hash or nil),
  #   and `:is_available`.
  # @raise [UnexpectedResponseFormat] if the string isn't valid JSON, or isn't
  #   a JSON object with an `object_data` field.
  def self.parse_nc_page(nc_page_str)
    begin
      nc_page = JSON.parse(nc_page_str)
    rescue JSON::ParserError
      Rails.logger.debug "Unexpected NC page response:\n#{nc_page_str}"
      raise UnexpectedResponseFormat,
        "failed to parse NC page response as JSON"
    end

    # Valid JSON isn't necessarily a JSON *object*: guard against arrays and
    # scalars too, so callers get our own error class, not a NoMethodError.
    unless nc_page.is_a?(Hash) && nc_page.key?("object_data")
      raise UnexpectedResponseFormat, "missing field object_data in NC page"
    end

    # NOTE: When there's no object data, it will be an empty array instead of
    # an empty hash. Weird API thing to work around!
    object_data = nc_page["object_data"]
    object_data = {} if object_data == []

    items = object_data.values.map do |item_info|
      {
        id: item_info["id"],
        name: item_info["name"],
        description: item_info["description"],
        price: item_info["price"],
        discount: parse_item_discount(item_info),
        is_available: item_info["isAvailable"] == 1,
      }
    end

    {items:}
  end

  # Given item info, return a hash of discount-specific info, if any.
  #
  # @param item_info [Hash] one item entry from the page's `object_data`.
  # @return [Hash, nil] `{price:, begins_at:, ends_at:}` when the item has a
  #   positive `discountPrice`, or nil otherwise.
  def self.parse_item_discount(item_info)
    discount_price = item_info["discountPrice"]
    return nil unless discount_price.present? && discount_price > 0

    {
      price: discount_price,
      begins_at: item_info["discountBegin"],
      ends_at: item_info["discountEnd"],
    }
  end

  # Raised when an NC Mall request responds with a status other than 200.
  class ResponseNotOK < StandardError
    # The HTTP status code we actually received.
    attr_reader :status

    def initialize(status)
      # Bare `super` forwards `status` as the default exception message; our
      # `raise ResponseNotOK.new(...), "message"` call sites then replace it
      # with a more descriptive one.
      super
      @status = status
    end
  end

  # Raised when an NC Mall response body isn't in the shape we expect.
  class UnexpectedResponseFormat < StandardError;end
end

View file

@ -21,4 +21,8 @@ ActiveSupport::Inflector.inflections(:en) do |inflect|
# Teach Zeitwerk that `NeoPass` is what to expect in `app/services/neopass.rb`.
inflect.acronym "NeoPass"
# Teach Zeitwerk that "NCMall" is what to expect in `app/services/nc_mall.rb`.
# (We do this by teaching it the word "NC".)
inflect.acronym "NC"
end

View file

@ -0,0 +1,13 @@
# Creates the `nc_mall_records` table. Each row references one item, and
# stores its price, plus optional discount details.
class CreateNCMallRecords < ActiveRecord::Migration[7.1]
def change
create_table :nc_mall_records do |t|
# The item this record describes: required, with a foreign key constraint.
# (`type: :integer` presumably matches the type of `items.id` — confirm
# against the schema.)
t.references :item, type: :integer, null: false, foreign_key: true
# The price is required.
t.integer :price, null: false
# The discount columns are all nullable.
t.integer :discount_price
t.datetime :discount_begins_at
t.datetime :discount_ends_at
t.timestamps
end
end
end

View file

@ -0,0 +1,11 @@
# Replaces the existing index on `nc_mall_records.item_id` with a unique one,
# so that the database allows at most one NC Mall record per item.
class AddUniqueIndexOnItemIdToNCMallRecords < ActiveRecord::Migration[7.1]
def change
# NOTE: We need to temporarily remove the foreign key, then add it back
# once the index is in. (Presumably the database won't let us drop an index
# that a foreign key still depends on — confirm if this ever changes.)
remove_foreign_key :nc_mall_records, :items
# Swap the existing index for a unique index on the same column.
remove_index :nc_mall_records, :item_id
add_index :nc_mall_records, :item_id, unique: true
add_foreign_key :nc_mall_records, :items
end
end

View file

@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[7.1].define(version: 2024_05_02_195157) do
ActiveRecord::Schema[7.1].define(version: 2024_05_11_003019) do
create_table "alt_styles", charset: "utf8mb4", collation: "utf8mb4_unicode_520_ci", force: :cascade do |t|
t.integer "species_id", null: false
t.integer "color_id", null: false
@ -154,6 +154,17 @@ ActiveRecord::Schema[7.1].define(version: 2024_05_02_195157) do
t.string "pet_name", limit: 128, null: false
end
create_table "nc_mall_records", charset: "utf8mb4", collation: "utf8mb4_general_ci", force: :cascade do |t|
t.integer "item_id", null: false
t.integer "price", null: false
t.integer "discount_price"
t.datetime "discount_begins_at"
t.datetime "discount_ends_at"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["item_id"], name: "index_nc_mall_records_on_item_id", unique: true
end
create_table "neopets_connections", id: :integer, charset: "utf8mb4", collation: "utf8mb4_unicode_520_ci", force: :cascade do |t|
t.integer "user_id"
t.string "neopets_username"
@ -278,5 +289,6 @@ ActiveRecord::Schema[7.1].define(version: 2024_05_02_195157) do
add_foreign_key "alt_styles", "colors"
add_foreign_key "alt_styles", "species"
add_foreign_key "nc_mall_records", "items"
add_foreign_key "outfits", "alt_styles"
end

100
lib/tasks/nc_mall.rake Normal file
View file

@ -0,0 +1,100 @@
namespace :nc_mall do
  # Brings the NCMallRecord table in line with what the live NC Mall is
  # selling right now: creates records for newly-listed items, updates records
  # whose price or discount changed, skips unchanged ones, and destroys records
  # for items that are no longer listed. Items that aren't in our own database
  # yet are ignored. Every decision is logged to STDOUT.
  desc "Sync our NCMallRecord table with the live NC Mall"
  task :sync => :environment do
    # Log to STDOUT.
    Rails.logger = Logger.new(STDOUT)

    # First, load all records of what's being sold in the live NC Mall. We load
    # the homepage and all pages linked from the main document, and extract the
    # items from each.
    #
    # We also de-duplicate the items *by item ID*, which is important because
    # the algorithm expects to only process each item once! (Comparing whole
    # hashes isn't enough: if two pages list the same item with any field
    # differing, we'd try to create a second record for it, and run into the
    # unique index on `item_id`.) When an item is listed more than once, the
    # first listing wins.
    pages = load_all_nc_mall_pages
    live_item_records = pages.
      flat_map { |page| page[:items] }.
      uniq { |record_data| record_data[:id] }

    # Then, get the existing NC Mall records in our database. (We include the
    # items, to be able to output the item name during logging.)
    existing_records = NCMallRecord.includes(:item).all
    existing_records_by_item_id = existing_records.to_h { |r| [r.item_id, r] }

    # Additionally, check which of the item IDs in the live records are items
    # we've seen before. (We'll skip records for items we don't know.)
    live_item_ids = live_item_records.map { |r| r[:id] }
    recognized_item_ids = Item.where(id: live_item_ids).pluck(:id).to_set
    Rails.logger.debug "We found #{live_item_records.size} items, and we " +
      "recognize #{recognized_item_ids.size} of them."

    # For each record in the live NC Mall, check if there's an existing record.
    # If so, update it, and remove it from the existing records hash. If not,
    # create it.
    live_item_records.each do |record_data|
      # If we don't recognize this item ID in our database already, skip it.
      next unless recognized_item_ids.include?(record_data[:id])

      record = existing_records_by_item_id.delete(record_data[:id]) ||
        NCMallRecord.new
      record.item_id = record_data[:id]
      record.price = record_data[:price]
      # `discount` is nil when the item isn't discounted, so `dig` clears all
      # three discount columns in that case.
      record.discount_price = record_data.dig(:discount, :price)
      record.discount_begins_at = record_data.dig(:discount, :begins_at)
      record.discount_ends_at = record_data.dig(:discount, :ends_at)

      unless record.changed?
        Rails.logger.info "Skipping record for item #{record_data[:name]} " +
          "(unchanged)"
        next
      end

      if record.save
        if record.previously_new_record?
          Rails.logger.info "Created record for item #{record_data[:name]}"
        else
          Rails.logger.info "Updated record for item #{record_data[:name]}"
        end
      else
        Rails.logger.error "Failed to save record for item " +
          "#{record_data[:name]}: " +
          "#{record.errors.full_messages.join("; ")}: " +
          "#{record.inspect}"
      end
    end

    # For each existing record remaining in the existing records hash, this
    # means there was no live record corresponding to it during this sync.
    # Delete it!
    existing_records_by_item_id.each_value do |record|
      item_name = record.item&.name || "<item not found>"
      if record.destroy
        Rails.logger.info "Destroyed record #{record.id} for item " +
          "#{item_name}"
      else
        Rails.logger.error "Failed to destroy record #{record.id} for " +
          "item #{item_name}: #{record.inspect}"
      end
    end
  end
end
# Load every NC Mall page we know how to find: the homepage, plus each page
# linked from the mall's root document.
#
# Returns an array of parsed pages, with the homepage first, followed by the
# linked pages in the same order as their links. If any request fails, the
# error propagates out of this method.
def load_all_nc_mall_pages
  Sync do
    # Kick off the homepage request first, as its own task.
    home_task = Async { NCMall.load_home_page }

    # Meanwhile, find out which other pages (categories etc.) exist.
    page_links = NCMall.load_page_links

    # Request each linked page. The semaphore limits us to 10 at a time, and
    # the barrier lets us wait on (or cancel) the whole batch together.
    barrier = Async::Barrier.new
    limiter = Async::Semaphore.new(10, parent: barrier)
    begin
      page_tasks = page_links.map do |page_link|
        limiter.async { NCMall.load_page page_link[:type], page_link[:cat] }
      end

      # Block here until every linked page has been loaded.
      barrier.wait
    ensure
      # If one of the pages failed, stop whatever is still in flight.
      barrier.stop
    end

    # Collect the results: homepage first, then the linked pages.
    [home_task, *page_tasks].map(&:wait)
  end
end