Update NC Mall scraping for new redesign
First actual feature I'm letting Claude run! We worked through the exploration of the updated API together, then it ran with the implementation. I left this hanging for a long time... good to finally have it updated!
This commit is contained in:
parent
b1f06029f8
commit
3582229b47
3 changed files with 209 additions and 71 deletions
|
|
@ -1,25 +1,74 @@
|
|||
require "addressable/template"
|
||||
|
||||
# Neopets::NCMall integrates with the Neopets NC Mall to fetch currently
|
||||
# available items and their pricing.
|
||||
#
|
||||
# The integration works in two steps:
|
||||
#
|
||||
# 1. Category Discovery: We fetch the NC Mall homepage and extract the
|
||||
# browsable categories from the embedded `window.ncmall_menu` JSON data.
|
||||
# We filter out special feature categories (those with external URLs) and
|
||||
# structural parent nodes (those without a cat_id).
|
||||
#
|
||||
# 2. Item Fetching: For each category, we call the v2 category API with
|
||||
# pagination support. Large categories may span multiple pages, which we
|
||||
# fetch in parallel and combine. Items can appear in multiple categories,
|
||||
# so the rake task de-duplicates by item ID.
|
||||
#
|
||||
# The parsed item data includes:
|
||||
# - id: Neopets item ID
|
||||
# - name: Item display name
|
||||
# - description: Item description
|
||||
# - price: Regular price in NC (NeoCash)
|
||||
# - discount: Optional discount info (price, begins_at, ends_at)
|
||||
# - is_available: Whether the item is currently purchasable
|
||||
#
|
||||
# This module is used by the `neopets:import:nc_mall` rake task to sync our
|
||||
# NCMallRecord table with the live NC Mall.
|
||||
module Neopets::NCMall
|
||||
# Load the NC Mall home page content area, and return its useful data.
|
||||
HOME_PAGE_URL = "https://ncmall.neopets.com/mall/ajax/home_page.phtml"
|
||||
def self.load_home_page
|
||||
load_page_by_url HOME_PAGE_URL
|
||||
end
|
||||
|
||||
# Load the NC Mall page for a specific type and category ID, with pagination.
#
# The template targets the v2 category endpoint; `load_page` expands it with
# `type`, `cat`, `page`, and `limit` as query parameters.
CATEGORY_PAGE_URL_TEMPLATE = Addressable::Template.new(
  "https://ncmall.neopets.com/mall/ajax/v2/category/index.phtml{?type,cat,page,limit}"
)
|
||||
# Fetch a single page of a category from the v2 category API, and parse it.
#
# `type` and `cat` identify the category; `page` and `limit` select which
# slice of it to request (defaulting to the first page of 24 items).
#
# The response body is handed to `parse_nc_page`, so the result is the hash
# it builds (items plus pagination metadata). Raises ResponseNotOK if the
# HTTP status is anything other than 200.
def self.load_page(type, cat, page: 1, limit: 24)
  url = CATEGORY_PAGE_URL_TEMPLATE.expand(type:, cat:, page:, limit:)
  Sync do
    DTIRequests.get(url) do |response|
      if response.status != 200
        raise ResponseNotOK.new(response.status),
          "expected status 200 but got #{response.status} (#{url})"
      end

      parse_nc_page response.read
    end
  end
end
|
||||
|
||||
# Load the NC Mall root document HTML, and extract the list of links to
|
||||
# other pages ("New", "Popular", etc.)
|
||||
# Load all pages for a specific category, and return their combined items.
#
# Page 1 is always fetched first, because its response is what tells us how
# many pages the category has. Any further pages are then requested in
# parallel, and the items are concatenated in page order.
def self.load_category_all_pages(type, cat, limit: 24)
  # First, load page 1 to get total page count
  first_page = load_page(type, cat, page: 1, limit:)
  total_pages = first_page[:total_pages]

  # If there's only one page (or none reported), page 1 is everything
  return first_page[:items] if total_pages <= 1

  # Otherwise, load remaining pages in parallel
  Sync do
    remaining_page_tasks = (2..total_pages).map do |page_num|
      Async { load_page(type, cat, page: page_num, limit:) }
    end

    all_pages = [first_page] + remaining_page_tasks.map(&:wait)
    all_pages.flat_map { |page| page[:items] }
  end
end
|
||||
|
||||
# Load the NC Mall root document HTML, and extract categories from the
|
||||
# embedded menu JSON.
|
||||
ROOT_DOCUMENT_URL = "https://ncmall.neopets.com/mall/shop.phtml"
|
||||
PAGE_LINK_PATTERN = /load_items_pane\(['"](.+?)['"], ([0-9]+)\).+?>(.+?)</
|
||||
def self.load_page_links
|
||||
MENU_JSON_PATTERN = /window\.ncmall_menu = (\[.*?\]);/m
|
||||
def self.load_categories
|
||||
html = Sync do
|
||||
DTIRequests.get(ROOT_DOCUMENT_URL) do |response|
|
||||
if response.status != 200
|
||||
|
|
@ -31,11 +80,34 @@ module Neopets::NCMall
|
|||
end
|
||||
end
|
||||
|
||||
# Extract `load_items_pane` calls from the root document's HTML. (We use
|
||||
# a very simplified regex, rather than actually parsing the full HTML!)
|
||||
html.scan(PAGE_LINK_PATTERN).
|
||||
map { |type, cat, label| {type:, cat:, label:} }.
|
||||
uniq
|
||||
# Extract the ncmall_menu JSON from the script tag
|
||||
match = html.match(MENU_JSON_PATTERN)
|
||||
unless match
|
||||
raise UnexpectedResponseFormat,
|
||||
"could not find window.ncmall_menu in homepage HTML"
|
||||
end
|
||||
|
||||
begin
|
||||
menu = JSON.parse(match[1])
|
||||
rescue JSON::ParserError => e
|
||||
Rails.logger.debug "Failed to parse ncmall_menu JSON: #{e.message}"
|
||||
raise UnexpectedResponseFormat,
|
||||
"failed to parse ncmall_menu as JSON"
|
||||
end
|
||||
|
||||
# Flatten the menu structure, and filter to browsable categories
|
||||
browsable_categories = flatten_categories(menu).
|
||||
# Skip categories without a cat_id (structural parent nodes)
|
||||
reject { |cat| cat['cat_id'].blank? }.
|
||||
# Skip categories with external URLs (special features)
|
||||
reject { |cat| cat['url'].present? }
|
||||
|
||||
# Map each category to include the API type (and remove load_type)
|
||||
browsable_categories.map do |cat|
|
||||
cat.except("load_type").merge(
|
||||
"type" => map_load_type_to_api_type(cat["load_type"])
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
def self.load_styles(species_id:, neologin:)
|
||||
|
|
@ -50,6 +122,26 @@ module Neopets::NCMall
|
|||
|
||||
private
|
||||
|
||||
# Map load_type from menu JSON to the v2 API type parameter.
#
# "new" and "popular" have dedicated API types; every other load_type
# (including a missing one) is treated as an ordinary browsable category.
def self.map_load_type_to_api_type(load_type)
  case load_type
  when "new"
    "new_items"
  when "popular"
    "popular_items"
  else
    "browse"
  end
end
|
||||
|
||||
# Flatten nested category structure (handles children arrays).
#
# Returns a flat array in depth-first order: each category is immediately
# followed by all of its descendants. The category hashes themselves are
# returned as-is, so parents still carry their "children" key.
def self.flatten_categories(menu)
  menu.flat_map do |cat|
    # A missing or null "children" entry means this is a leaf category.
    children = cat["children"] || []
    [cat] + flatten_categories(children)
  end
end
|
||||
|
||||
STYLING_STUDIO_URL = "https://www.neopets.com/np-templates/ajax/stylingstudio/studio.php"
|
||||
def self.load_styles_tab(species_id:, neologin:, tab:)
|
||||
Sync do
|
||||
|
|
@ -81,20 +173,7 @@ module Neopets::NCMall
|
|||
end
|
||||
end
|
||||
|
||||
def self.load_page_by_url(url)
|
||||
Sync do
|
||||
DTIRequests.get(url) do |response|
|
||||
if response.status != 200
|
||||
raise ResponseNotOK.new(response.status),
|
||||
"expected status 200 but got #{response.status} (#{url})"
|
||||
end
|
||||
|
||||
parse_nc_page response.read
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Given a string of NC page data, parse the useful data out of it!
|
||||
# Given a string of v2 NC page data, parse the useful data out of it!
|
||||
def self.parse_nc_page(nc_page_str)
|
||||
begin
|
||||
nc_page = JSON.parse(nc_page_str)
|
||||
|
|
@ -104,24 +183,14 @@ module Neopets::NCMall
|
|||
"failed to parse NC page response as JSON"
|
||||
end
|
||||
|
||||
unless nc_page.has_key? "object_data"
|
||||
raise UnexpectedResponseFormat, "missing field object_data in NC page"
|
||||
# v2 API returns items in a "data" array
|
||||
unless nc_page.has_key? "data"
|
||||
raise UnexpectedResponseFormat, "missing field data in v2 NC page"
|
||||
end
|
||||
|
||||
object_data = nc_page["object_data"]
|
||||
item_data = nc_page["data"] || []
|
||||
|
||||
# NOTE: When there's no object data, it will be an empty array instead of
|
||||
# an empty hash. Weird API thing to work around!
|
||||
object_data = {} if object_data == []
|
||||
|
||||
# Only the items in the `render` list are actually listed as directly for
|
||||
# sale in the shop. `object_data` might contain other items that provide
|
||||
# supporting information about them, but aren't actually for sale.
|
||||
visible_object_data = (nc_page["render"] || []).
|
||||
map { |id| object_data[id.to_s] }.
|
||||
filter(&:present?)
|
||||
|
||||
items = visible_object_data.map do |item_info|
|
||||
items = item_data.map do |item_info|
|
||||
{
|
||||
id: item_info["id"],
|
||||
name: item_info["name"],
|
||||
|
|
@ -132,7 +201,12 @@ module Neopets::NCMall
|
|||
}
|
||||
end
|
||||
|
||||
{items:}
|
||||
{
|
||||
items:,
|
||||
total_pages: nc_page["totalPages"].to_i,
|
||||
page: nc_page["page"].to_i,
|
||||
limit: nc_page["limit"].to_i,
|
||||
}
|
||||
end
|
||||
|
||||
# Given item info, return a hash of discount-specific info, if any.
|
||||
|
|
|
|||
|
|
@ -7,11 +7,10 @@ namespace "neopets:import" do
|
|||
puts "Importing from NC Mall…"
|
||||
|
||||
# First, load all records of what's being sold in the live NC Mall. We load
|
||||
# the homepage and all pages linked from the main document, and extract the
|
||||
# items from each. (We also de-duplicate the items, which is important
|
||||
# because the algorithm expects to only process each item once!)
|
||||
pages = load_all_nc_mall_pages
|
||||
live_item_records = pages.map { |p| p[:items] }.flatten.uniq
|
||||
# all categories from the menu and fetch all items from each. (We also
|
||||
# de-duplicate the items, which is important because the same item can
|
||||
# appear in multiple categories!)
|
||||
live_item_records = load_all_nc_mall_items.uniq { |item| item[:id] }
|
||||
|
||||
# Then, get the existing NC Mall records in our database. (We include the
|
||||
# items, to be able to output the item name during logging.)
|
||||
|
|
@ -76,22 +75,28 @@ namespace "neopets:import" do
|
|||
end
|
||||
end
|
||||
|
||||
# Load every item currently listed in the NC Mall, across all browsable
# categories. The result may contain the same item more than once, because
# an item can appear in multiple categories; the caller de-duplicates.
def load_all_nc_mall_items
  Sync do
    # Load all categories from the menu JSON
    categories = Neopets::NCMall.load_categories

    # Load all pages for each category, 10 categories at a time
    category_item_tasks = DTIRequests.load_many(max_at_once: 10) do |task|
      categories.map do |category|
        task.async do
          type = category["type"]
          cat_id = category["cat_id"]

          Rails.logger.debug "Loading category: #{category["cat_name"]} " +
            "(type=#{type}, cat=#{cat_id})"

          Neopets::NCMall.load_category_all_pages(type, cat_id)
        end
      end
    end

    # Flatten all items from all categories and return as a single array
    # (We'll de-duplicate in the main task)
    category_item_tasks.flat_map(&:wait)
  end
end
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@ require_relative '../rails_helper'
|
|||
|
||||
RSpec.describe Neopets::NCMall, type: :model do
|
||||
describe ".load_page" do
|
||||
def stub_page_request
|
||||
stub_request(:get, "https://ncmall.neopets.com/mall/ajax/load_page.phtml?type=new&cat=52&lang=en").
|
||||
def stub_v2_page_request(page: 1)
|
||||
stub_request(:get, "https://ncmall.neopets.com/mall/ajax/v2/category/index.phtml?type=new_items&cat=52&page=#{page}&limit=24").
|
||||
with(
|
||||
headers: {
|
||||
"User-Agent": Rails.configuration.user_agent_for_neopets,
|
||||
|
|
@ -13,12 +13,12 @@ RSpec.describe Neopets::NCMall, type: :model do
|
|||
end
|
||||
|
||||
subject(:page) do
|
||||
Neopets::NCMall.load_page("new", 52)
|
||||
Neopets::NCMall.load_page("new_items", 52, page: 1, limit: 24)
|
||||
end
|
||||
|
||||
it "loads a page from the NC Mall" do
|
||||
stub_page_request.to_return(
|
||||
body: '{"html":"","render_html":"0","render":[82936,90226],"object_data":{"82936":{"id":82936,"name":"+1 Extra Pet Slot","description":"Just ONE more Neopet... just ONE more...! This pack includes 1 extra pet slot. Each extra pet slot can be used to create a new pet, adopt a pet, or bring back any idle pets lost from non-premium accounts.","price":500,"discountPrice":0,"atPurchaseDiscountPrice":null,"discountBegin":1735372800,"discountEnd":1735718399,"uses":1,"isSuperpack":0,"isBundle":0,"packContents":null,"isAvailable":1,"imageFile":"mall_petslots_1","saleBegin":1703094300,"saleEnd":0,"duration":0,"isSoldOut":0,"isNeohome":0,"isWearable":0,"isBuyable":1,"isAlbumTheme":0,"isGiftbox":0,"isInRandomWindow":null,"isElite":0,"isCollectible":0,"isKeyquest":0,"categories":null,"isHabitarium":0,"isNoInvInsert":1,"isLimitedQuantity":0,"isPresale":0,"isGambling":0,"petSlotPack":1,"maxPetSlots":10,"currentUserBoughtPetSlots":0,"formatted":{"name":"+1 Extra Pet Slot","ck":false,"price":"500","discountPrice":"0","limited":false},"converted":true},"90226":{"id":90226,"name":"Weekend Sales 2025 Mega Gram","description":"Lets go shopping! Purchase this Weekend Sales Mega Gram and choose from exclusive Weekend Sales items to send to a Neofriend, no gift box needed! This gram also has a chance of including a Limited Edition NC item. 
Please visit the NC Mall FAQs for more information on this item.","price":250,"discountPrice":125,"atPurchaseDiscountPrice":null,"discountBegin":1737136800,"discountEnd":1737446399,"uses":1,"isSuperpack":0,"isBundle":0,"packContents":null,"isAvailable":1,"imageFile":"42embjc204","saleBegin":1737136800,"saleEnd":1739865599,"duration":0,"isSoldOut":0,"isNeohome":0,"isWearable":0,"isBuyable":1,"isAlbumTheme":0,"isGiftbox":0,"isInRandomWindow":null,"isElite":0,"isCollectible":0,"isKeyquest":0,"categories":null,"isHabitarium":0,"isNoInvInsert":0,"isLimitedQuantity":0,"isPresale":0,"isGambling":0,"formatted":{"name":"Weekend Sales 2025 Mega Gram","ck":false,"price":"250","discountPrice":"125","limited":false},"converted":true}},"response":{"category":"52","type":"new","image":{"location":"//images.neopets.com/items/","star_location":"//images.neopets.com/ncmall/","extension":".gif","stars":{"blue":"star_blue","red":"star_red","orange":"star_orange","leso":"leso_star"}},"heading":"New","no_items_msg":"","shopkeeper":{"img":"//images.neopets.com/ncmall/shopkeepers/mall_new.jpg","title":"Style is all about what\'s new… good thing that\'s all I stock!","message":"Come browse my shop and find the latest and greatest the NC Mall has to offer!","new_format":true},"strings":{"claim_it":"Claim it","none_left":"Sorry, there are none left!","nc":"NC","free":"FREE","add_to_cart":"Add to cart"}}}'
|
||||
it "loads a page from the v2 NC Mall API" do
|
||||
stub_v2_page_request.to_return(
|
||||
body: '{"html":"","render_html":"0","type":"new_items","data":[{"id":82936,"name":"+1 Extra Pet Slot","description":"Just ONE more Neopet... just ONE more...! This pack includes 1 extra pet slot. Each extra pet slot can be used to create a new pet, adopt a pet, or bring back any idle pets lost from non-premium accounts.","price":500,"discountPrice":0,"atPurchaseDiscountPrice":null,"discountBegin":1735372800,"discountEnd":1735718399,"uses":1,"isSuperpack":0,"isBundle":0,"packContents":null,"isAvailable":1,"imageFile":"mall_petslots_1","saleBegin":1703094300,"saleEnd":0,"duration":0,"isSoldOut":0,"isNeohome":0,"isWearable":0,"isBuyable":1,"isAlbumTheme":0,"isGiftbox":0,"isInRandomWindow":null,"isElite":0,"isCollectible":0,"isKeyquest":0,"categories":null,"isHabitarium":0,"isNoInvInsert":1,"isLimitedQuantity":0,"isPresale":0,"isGambling":0,"petSlotPack":1,"maxPetSlots":10,"currentUserBoughtPetSlots":0,"formatted":{"name":"+1 Extra Pet Slot","ck":false,"price":"500","discountPrice":"0","limited":false},"converted":true},{"id":90226,"name":"Weekend Sales 2025 Mega Gram","description":"Lets go shopping! Purchase this Weekend Sales Mega Gram and choose from exclusive Weekend Sales items to send to a Neofriend, no gift box needed! This gram also has a chance of including a Limited Edition NC item. 
Please visit the NC Mall FAQs for more information on this item.","price":250,"discountPrice":125,"atPurchaseDiscountPrice":null,"discountBegin":1737136800,"discountEnd":1737446399,"uses":1,"isSuperpack":0,"isBundle":0,"packContents":null,"isAvailable":1,"imageFile":"42embjc204","saleBegin":1737136800,"saleEnd":1739865599,"duration":0,"isSoldOut":0,"isNeohome":0,"isWearable":0,"isBuyable":1,"isAlbumTheme":0,"isGiftbox":0,"isInRandomWindow":null,"isElite":0,"isCollectible":0,"isKeyquest":0,"categories":null,"isHabitarium":0,"isNoInvInsert":0,"isLimitedQuantity":0,"isPresale":0,"isGambling":0,"formatted":{"name":"Weekend Sales 2025 Mega Gram","ck":false,"price":"250","discountPrice":"125","limited":false},"converted":true}],"totalItems":"2","totalPages":"1","page":"1","limit":"24"}'
|
||||
)
|
||||
|
||||
expect(page[:items]).to contain_exactly(
|
||||
|
|
@ -45,6 +45,65 @@ RSpec.describe Neopets::NCMall, type: :model do
|
|||
is_available: true,
|
||||
},
|
||||
)
|
||||
expect(page[:total_pages]).to eq(1)
|
||||
expect(page[:page]).to eq(1)
|
||||
end
|
||||
|
||||
it "handles pagination metadata" do
|
||||
stub_v2_page_request.to_return(
|
||||
body: '{"html":"","render_html":"0","type":"new_items","data":[{"id":82936,"name":"Test Item","description":"Test","price":100,"discountPrice":0,"atPurchaseDiscountPrice":null,"discountBegin":1735372800,"discountEnd":1735718399,"uses":1,"isSuperpack":0,"isBundle":0,"packContents":null,"isAvailable":1,"imageFile":"test","saleBegin":1703094300,"saleEnd":0,"duration":0,"isSoldOut":0,"isNeohome":0,"isWearable":1,"isBuyable":1,"isAlbumTheme":0,"isGiftbox":0,"isInRandomWindow":null,"isElite":0,"isCollectible":0,"isKeyquest":0,"categories":null,"isHabitarium":0,"isNoInvInsert":0,"isLimitedQuantity":0,"isPresale":0,"isGambling":0,"formatted":{"name":"Test Item","ck":false,"price":"100","discountPrice":"0","limited":false},"converted":true}],"totalItems":"50","totalPages":"3","page":"1","limit":"24"}'
|
||||
)
|
||||
|
||||
expect(page[:total_pages]).to eq(3)
|
||||
expect(page[:page]).to eq(1)
|
||||
expect(page[:limit]).to eq(24)
|
||||
end
|
||||
end
|
||||
|
||||
# Covers category discovery: the homepage HTML embeds a `window.ncmall_menu`
# JSON array, which `load_categories` flattens, filters, and re-labels.
describe ".load_categories" do
  def stub_homepage_request
    stub_request(:get, "https://ncmall.neopets.com/mall/shop.phtml").
      with(
        headers: {
          "User-Agent": Rails.configuration.user_agent_for_neopets,
        },
      )
  end

  subject(:categories) do
    Neopets::NCMall.load_categories
  end

  it "extracts browsable categories from menu JSON and maps load types" do
    # The fixture menu has two top-level typed categories, one parent with
    # two children, and one id-less parent whose children have external URLs.
    stub_homepage_request.to_return(
      body: '<html><head><script>window.ncmall_menu = [{"cat_id":52,"cat_name":"New","load_type":"new"},{"cat_id":54,"cat_name":"Popular","load_type":"popular"},{"cat_id":42,"cat_name":"Customization","load_type":"neopet","children":[{"cat_id":43,"cat_name":"Clothing","parent_id":42},{"cat_id":44,"cat_name":"Shoes","parent_id":42}]},{"cat_name":"Specialty","children":[{"cat_id":85,"cat_name":"NC Collectible","load_type":"collectible","url":"https://www.neopets.com/mall/nc_collectible_case.phtml"},{"cat_id":13,"cat_name":"Elite Boutique","url":"https://ncmall.neopets.com/mall/shop.phtml?page=&cat=13"}]}];</script></head></html>'
    )

    expect(categories).to contain_exactly(
      hash_including("cat_id" => 52, "cat_name" => "New", "type" => "new_items"),
      hash_including("cat_id" => 54, "cat_name" => "Popular", "type" => "popular_items"),
      hash_including("cat_id" => 42, "cat_name" => "Customization", "type" => "browse"),
      hash_including("cat_id" => 43, "cat_name" => "Clothing", "parent_id" => 42, "type" => "browse"),
      hash_including("cat_id" => 44, "cat_name" => "Shoes", "parent_id" => 42, "type" => "browse"),
    )

    # Should NOT include load_type field (it's been converted to type)
    categories.each do |cat|
      expect(cat).not_to have_key("load_type")
    end

    # Should NOT include categories with external URLs
    expect(categories).not_to include(
      hash_including("cat_name" => "NC Collectible"),
    )
    expect(categories).not_to include(
      hash_including("cat_name" => "Elite Boutique"),
    )

    # Should NOT include structural parent without cat_id
    expect(categories).not_to include(
      hash_including("cat_name" => "Specialty"),
    )
  end
end
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue