impress/app/services/nc_mall.rb
Matchu 46d3325144 Load *all* NC Mall pages in nc_mall:sync
Ta da! Now I can run this and pull 481 records into our database, and
then turn around and run it again and have them all correctly say
"skipped"!
2024-05-10 17:39:40 -07:00

116 lines
3.4 KiB
Ruby

require "addressable/template"
require "async/http/internet/instance"
module NCMall
# Share a pool of persistent connections, rather than reconnecting on
# each request. (This library does that automatically!)
#
# Every HTTP request made by this module goes through this one client.
INTERNET = Async::HTTP::Internet.instance
# URL of the AJAX endpoint that serves the NC Mall home page's content area.
HOME_PAGE_URL = "https://ncmall.neopets.com/mall/ajax/home_page.phtml"

# Fetch the NC Mall home page content area and return its useful data.
#
# @return whatever `load_page_by_url` returns for `HOME_PAGE_URL`.
def self.load_home_page
  load_page_by_url(HOME_PAGE_URL)
end
# Load the NC Mall page for a specific type and category ID.
#
# URI template for those pages: `load_page` expands `type` and `cat` into
# it, and the `{&type,cat}` expression appends them as additional query
# parameters after the fixed `lang=en`.
CATEGORY_PAGE_URL_TEMPLATE = Addressable::Template.new(
"https://ncmall.neopets.com/mall/ajax/load_page.phtml?lang=en{&type,cat}"
)
# Fetch one NC Mall category page and return its useful data.
#
# @param type the page type to substitute into the category URL template.
# @param cat the category ID to substitute into the category URL template.
# @return whatever `load_page_by_url` returns for the expanded URL.
def self.load_page(type, cat)
  page_url = CATEGORY_PAGE_URL_TEMPLATE.expand(type: type, cat: cat)
  load_page_by_url(page_url)
end
# Load the NC Mall root document HTML, and extract the list of links to
# other pages ("New", "Popular", etc.)
ROOT_DOCUMENT_URL = "https://ncmall.neopets.com/mall/shop.phtml"

# Matches a `load_items_pane('<type>', <cat>)` call, then captures the text
# between the next `>` and the following `<` as the link's label.
PAGE_LINK_PATTERN = /load_items_pane\(['"](.+?)['"], ([0-9]+)\).+?>(.+?)</

# Fetch the root document and list the pages it links to.
#
# @return [Array<Hash>] unique `{type:, cat:, label:}` hashes in the order
#   they appear in the HTML. Every value is a String captured by
#   `PAGE_LINK_PATTERN` (so `cat` is a String of digits, not an Integer).
# @raise [ResponseNotOK] if the response status is not 200.
def self.load_page_links
  Sync do
    response = INTERNET.get(ROOT_DOCUMENT_URL, [
      ["User-Agent", Rails.configuration.user_agent_for_neopets],
    ])

    if response.status != 200
      # Report the URL via the constant: this method has no `url` local, so
      # interpolating one would raise NameError and mask the real failure.
      raise ResponseNotOK.new(response.status),
        "expected status 200 but got #{response.status} (#{ROOT_DOCUMENT_URL})"
    end

    # Extract `load_items_pane` calls from the root document's HTML. (We use
    # a very simplified regex, rather than actually parsing the full HTML!)
    html = response.read
    html.scan(PAGE_LINK_PATTERN).
      map { |type, cat, label| {type: type, cat: cat, label: label} }.
      uniq
  end
end
# NOTE(review): a bare `private` only changes the visibility of methods
# defined with plain `def`; the `def self.` methods below stay publicly
# callable. `private_class_method` would enforce the intent, but could break
# any external callers — confirm before changing.
private

# Fetch the NC page at the given URL and return its parsed data.
#
# @param url the URL to request.
# @return the result of `parse_nc_page` for the response body.
# @raise [ResponseNotOK] if the response status is not 200.
def self.load_page_by_url(url)
  Sync do
    request_headers = [
      ["User-Agent", Rails.configuration.user_agent_for_neopets],
    ]
    response = INTERNET.get(url, request_headers)

    unless response.status == 200
      raise ResponseNotOK.new(response.status),
        "expected status 200 but got #{response.status} (#{url})"
    end

    parse_nc_page(response.read)
  end
end
# Given a string of NC page data, parse the useful data out of it!
#
# @param nc_page_str [String] the raw JSON body of an NC page response.
# @return [Hash] `{items: [...]}`, where each item is a hash with the keys
#   `:id`, `:name`, `:description`, `:price`, `:discount` and
#   `:is_available`.
# @raise [UnexpectedResponseFormat] if the body isn't parseable JSON, isn't a
#   JSON object with an `object_data` field, or `object_data` is neither an
#   object nor an empty array.
def self.parse_nc_page(nc_page_str)
  begin
    nc_page = JSON.parse(nc_page_str)
  rescue JSON::ParserError, TypeError
    # TypeError covers non-String input such as `nil`, which JSON.parse
    # rejects before it ever gets to parsing.
    Rails.logger.debug "Unexpected NC page response:\n#{nc_page_str}"
    raise UnexpectedResponseFormat,
      "failed to parse NC page response as JSON"
  end

  # Valid JSON isn't necessarily an object (e.g. `[]` or `null`), so check
  # the type before asking for a key.
  unless nc_page.is_a?(Hash) && nc_page.key?("object_data")
    raise UnexpectedResponseFormat, "missing field object_data in NC page"
  end

  # NOTE: When there's no object data, it will be an empty array instead of
  # an empty hash. Weird API thing to work around!
  object_data = nc_page["object_data"]
  object_data = {} if object_data == []

  unless object_data.is_a?(Hash)
    raise UnexpectedResponseFormat,
      "expected field object_data to be an object in NC page"
  end

  items = object_data.values.map do |item_info|
    {
      id: item_info["id"],
      name: item_info["name"],
      description: item_info["description"],
      price: item_info["price"],
      discount: parse_item_discount(item_info),
      is_available: item_info["isAvailable"] == 1,
    }
  end

  {items: items}
end
# Build the discount details for an item, or nil when it has no discount.
#
# @param item_info [Hash] one item's raw data, keyed by String.
# @return [Hash, nil] `{price:, begins_at:, ends_at:}` when `discountPrice`
#   is present and greater than zero; otherwise nil.
#
# NOTE(review): the `> 0` comparison assumes `discountPrice` is numeric
# whenever it is present — confirm against the API.
def self.parse_item_discount(item_info)
  sale_price = item_info["discountPrice"]
  on_sale = sale_price.present? && sale_price > 0
  return unless on_sale

  {
    price: sale_price,
    begins_at: item_info["discountBegin"],
    ends_at: item_info["discountEnd"],
  }
end
# Raised when a request comes back with a status other than 200.
class ResponseNotOK < StandardError
  # The status of the response that triggered this error.
  attr_reader :status

  # @param status the response status; it also serves as the default
  #   message when `raise` isn't given one.
  def initialize(status)
    @status = status
    super(status)
  end
end
# Raised when an NC page response doesn't have the expected structure (for
# example, it isn't valid JSON, or it lacks the `object_data` field).
class UnexpectedResponseFormat < StandardError
end
end