Simpler, faster modeling query

I narrowed down the problem to the fact that we were joining in pet types against assets, and *then* running GROUP and DISTINCT and everything. Assets x compatible species/color pairs is a LOT of rows!

Here, we instead get all the relevant body IDs first, and *then* match them against pet types—which we fetch in one batch to match body to canonical species/color.

I'm also trashing the weird caching mechanism we did here, because in practice it doesn't seem reliable anyway. If anything, I'd want to look at stronger CDN caching. (I made a small improvement to the caching annotation, but ultimately it still doesn't matter, because this query uses logged-in stuff and always comes out max-age=0 anyway.)
This commit is contained in:
Emi Matchu 2021-03-18 12:57:29 -07:00
parent 1f103989a3
commit 0e8e50b054
3 changed files with 53 additions and 107 deletions

View file

@ -35,72 +35,3 @@ GRANT SELECT, UPDATE ON openneo_id.users TO impress2020;
-- mysqldump
GRANT LOCK TABLES ON * TO impress2020;
-- Procedures used in the application
DELIMITER $$
DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsV2$$
CREATE PROCEDURE GetItemsThatNeedModelsV2()
BEGIN
SELECT pet_types.color_id AS color_id, items.id AS item_id,
GROUP_CONCAT(DISTINCT pet_types.species_id ORDER BY pet_types.species_id)
AS modeled_species_ids,
-- Vandagyre was added on 2014-11-14, so we add some buffer here.
-- TODO: Some later Dyeworks items don't support Vandagyre.
-- Add a manual db flag?
items.created_at >= "2014-12-01" AS supports_vandagyre
FROM items
INNER JOIN parents_swf_assets psa
ON psa.parent_type = "Item" AND psa.parent_id = items.id
INNER JOIN swf_assets
ON swf_assets.id = psa.swf_asset_id
INNER JOIN pet_types
ON pet_types.body_id = swf_assets.body_id
WHERE
pet_types.color_id IN (
8, -- Blue (Standard)
6, -- Baby
44, -- Maraquan
46 -- Mutant
)
AND items.modeling_status_hint IS NULL
GROUP BY color_id, item_id
HAVING
NOT (
-- No species (either an All Bodies item, or a Capsule type thing)
count(DISTINCT pet_types.species_id) = 0
-- Single species (probably just their item)
OR count(DISTINCT pet_types.species_id) = 1
-- All species modeled
OR count(DISTINCT pet_types.species_id) = 55
-- All species modeled except Vandagyre, for items that don't support it
OR (NOT supports_vandagyre AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54")
)
ORDER BY color_id, item_id;
END$$
GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsV2 TO impress2020$$
-- This procedure is a performance optimization! We want the page to always be
-- up-to-date, but we want to avoid re-running this query if modeling data
-- hasn't changed. So, we use the last contribution timestamp as a cache hint
-- for whether the data has expired. But in this environment, sequential
-- queries are a bottleneck, so I bundled up that logic into this single
-- procedure that can run on the database! That way, it's just one network
-- round-trip instead of two, which makes a noticeable difference in our stack.
DROP PROCEDURE IF EXISTS GetItemsThatNeedModelsIfNotCachedV2$$
CREATE PROCEDURE GetItemsThatNeedModelsIfNotCachedV2(
IN last_known_update TIMESTAMP,
OUT last_actual_update TIMESTAMP
)
BEGIN
SET last_actual_update =
(SELECT created_at FROM contributions ORDER BY id DESC LIMIT 1);
IF last_known_update < last_actual_update THEN
CALL GetItemsThatNeedModelsV2();
END IF;
END$$
GRANT EXECUTE ON PROCEDURE GetItemsThatNeedModelsIfNotCachedV2 TO impress2020$$
DELIMITER ;

View file

@ -410,8 +410,6 @@ const buildNewestItemsLoader = (db, loaders) =>
return [entities];
});
let lastKnownUpdate = "1970-01-01"; // start it out very old!
let lastResult = new Map();
const buildItemsThatNeedModelsLoader = (db) =>
new DataLoader(async (keys) => {
// Essentially, I want to take easy advantage of DataLoader's caching, for
@ -421,48 +419,65 @@ const buildItemsThatNeedModelsLoader = (db) =>
throw new Error(`this loader can only be loaded with the key "all"`);
}
// Call the query as a procedure, defined in `setup-mysql.sql`. It will
// only run the query if modeling data has been changed since the timestamp
// we provide; otherwise, it skips the query and returns no rows, which is
// much faster! (The query takes a few seconds to run.)
//
// NOTE: This query has the colors hardcoded, we always fetch all of them!
// And then we look up the specific colors
const [results, _] = await db.query(
const [rows, _] = await db.query(
`
SELECT T_ITEMS.item_id,
T_BODIES.color_id,
T_ITEMS.supports_vandagyre,
COUNT(*) AS modeled_species_count,
GROUP_CONCAT(
T_BODIES.species_id
ORDER BY T_BODIES.species_id
) AS modeled_species_ids
FROM (
-- NOTE: I found that extracting this as a separate query that runs
-- first made things WAAAY faster. Less to join/group, I guess?
SELECT DISTINCT items.id AS item_id,
swf_assets.body_id AS body_id,
-- Vandagyre was added on 2014-11-14, so we add some buffer here.
-- TODO: Some later Dyeworks items don't support Vandagyre.
-- Add a manual db flag?
items.created_at >= "2014-12-01" AS supports_vandagyre
FROM items
INNER JOIN parents_swf_assets psa ON psa.parent_type = "Item"
AND psa.parent_id = items.id
INNER JOIN swf_assets ON swf_assets.id = psa.swf_asset_id
INNER JOIN item_translations it ON it.item_id = items.id AND it.locale = "en"
WHERE items.modeling_status_hint IS NULL AND it.name NOT LIKE "%MME%"
ORDER BY item_id
) T_ITEMS
INNER JOIN (
SELECT DISTINCT body_id, species_id, color_id
FROM pet_types
WHERE color_id IN (6, 8, 44, 46)
ORDER BY body_id, species_id
) T_BODIES ON T_ITEMS.body_id = T_BODIES.body_id
GROUP BY T_ITEMS.item_id, T_BODIES.color_id
HAVING NOT (
-- No species (either an All Bodies item, or a Capsule type thing)
modeled_species_count = 0
-- Single species (probably just their item)
OR modeled_species_count = 1
-- All species modeled
OR modeled_species_count = 55
-- All species modeled except Vandagyre, for items that don't support it
OR (NOT T_ITEMS.supports_vandagyre AND modeled_species_count = 54 AND modeled_species_ids = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54")
)
ORDER BY T_ITEMS.item_id;
`
CALL GetItemsThatNeedModelsIfNotCachedV2(?, @LastActualUpdate);
SELECT @LastActualUpdate;
`,
[lastKnownUpdate]
);
// The query will return 2 or 3 results.
// Result 1 (optional): The rows produced by the CALL, if it ran the query.
// Or, if it skipped the query, this is omitted.
// Result 2 (required): The MySQL summary of the effects of the CALL.
// Result 3 (required): The 1-row table contianing @LastActualUpdate.
//
// So, check the number of results. If it's 3, then we should update our
// cache. Or, if it's 2, then there was no change and we can continue with
// the existing cached value.
if (results.length === 3) {
const [rawRows, __, varRows] = results;
const rows = rawRows.map(normalizeRow);
const entities = rows.map(normalizeRow);
lastKnownUpdate = varRows[0]["@LastActualUpdate"];
// We build lastResult into a Map up-front, to speed up the many lookups
// that the GQL resolvers will do as we group this data into GQL nodes!
lastResult = new Map();
for (const { colorId, itemId, ...row } of rows) {
if (!lastResult.has(colorId)) {
lastResult.set(colorId, new Map());
}
lastResult.get(colorId).set(itemId, row);
const result = new Map();
for (const { colorId, itemId, ...entity } of entities) {
if (!result.has(colorId)) {
result.set(colorId, new Map());
}
result.get(colorId).set(itemId, entity);
}
return [lastResult];
return [result];
});
const buildItemBodiesWithAppearanceDataLoader = (db) =>

View file

@ -192,7 +192,7 @@ const typeDefs = gql`
# NOTE: Most color IDs won't be accepted here. Either pass the ID of a
# major special color like Baby (#6), or leave it blank for standard
# bodies like Blue, Green, Red, etc.
itemsThatNeedModels(colorId: ID): [Item!]!
itemsThatNeedModels(colorId: ID): [Item!]! @cacheControl(maxAge: 1, staleWhileRevalidate: ${oneHour})
}
extend type Mutation {