impress/lib/tasks/public_data.rake

require "open-uri"
require "open3"

desc "Tools to save and import DTI's public modeling data"
namespace :public_data do
	# Dumps the public modeling tables from the configured database into a
	# timestamped, gzipped SQL file under `Rails.configuration.public_data_root`,
	# then points the `latest.sql.gz` symlink in that directory at the new dump.
	#
	# Usage: `rails public_data:commit[name]`. The optional `name` argument
	# (default "manual") becomes part of the dump's filename.
	#
	# Raises if `mysqldump` or `gzip` exits unsuccessfully. In that case the
	# partial dump is deleted and `latest.sql.gz` is left untouched, so a broken
	# file is never published as the latest dump.
	desc "Save the local database's public data to a local file"
	task :commit, [:name] => :environment do |_, task_args|
		if Rails.env.development?
			puts "NOTE: The `public_data:commit` task is primarily meant to be " +
				"run in production, to create public data files we can copy to our " +
				"development machines via `public_data:pull`. I'll still run it " +
				"locally and save to #{Rails.configuration.public_data_root}, though!"
		end

		# Generate a filename from the current time, and the optional name argument
		# provided to the command (e.g. `rails public_data:commit[scheduled]`).
		# Colons are swapped for underscores to keep the filename simple.
		timestamp = Time.now.utc.iso8601.tr(":", "_")
		name = task_args.fetch(:name, "manual")
		filename = "#{timestamp}-#{name}.sql.gz"
		dest_path = Rails.configuration.public_data_root / filename

		# The command-line arguments for `mysqldump`. (Kept separate from the Rake
		# task arguments above, rather than reusing one variable for both.)
		dump_args = []

		# The connection details for our database!
		# NOTE(review): `--password=` on the command line can be visible to other
		# local users in the process list; consider a defaults file instead.
		config = ApplicationRecord.connection_db_config.configuration_hash
		dump_args << "--host=#{config[:host]}" if config[:host]
		dump_args << "--user=#{config[:username]}" if config[:username]
		dump_args << "--password=#{config[:password]}" if config[:password]

		# Don't lock the database to do it!
		dump_args << "--single-transaction"

		# Skip dumping tablespaces, so this requires fewer privileges.
		dump_args << "--no-tablespaces"

		# Dump the public data tables from the primary database.
		dump_args << config.fetch(:database)
		dump_args += %w(species colors zones) # manual constants
		dump_args += %w(alt_styles items parents_swf_assets pet_states pet_types
		                swf_assets) # from modeling

		# Ensure the output directory exists.
		dest_path.dirname.mkpath

		# Run mysqldump, pipe it into gzip, and output to the destination file.
		# `Open3.pipeline` returns one `Process::Status` per command, which lets us
		# check that *both* halves of the pipeline succeeded.
		statuses = Open3.pipeline(
			["mysqldump", *dump_args],
			["gzip", "-c"],
			out: dest_path.to_s,
		)

		unless statuses.all?(&:success?)
			# Don't leave a truncated dump lying around in the public directory, and
			# bail before relinking `latest.sql.gz`.
			dest_path.delete if dest_path.exist?
			raise "Failed to save public data dump: `mysqldump | gzip` exited " +
				"with #{statuses.map(&:to_s).join(', ')}"
		end

		puts "Saved dump to #{dest_path}"

		# Link this latest dump as `latest.sql.gz`. We link to the bare filename,
		# so the link is relative to the directory it lives in.
		#
		# We only unlink an existing *symlink*: if `latest.sql.gz` is somehow a
		# real file, `File.symlink` raises instead of overwriting it.
		latest_path = Rails.configuration.public_data_root / "latest.sql.gz"
		File.unlink(latest_path) if File.symlink?(latest_path)
		File.symlink(filename, latest_path)

		puts "Linked dump to #{latest_path}"
	end

	# Downloads the latest public data dump from production, unpacks it with
	# `gunzip`, and executes the SQL against the configured database via `mysql`.
	#
	# Runs only after `db:abort_if_pending_migrations` and `:environment`.
	#
	# Raises if not running in development mode, or if `gunzip` or `mysql` exits
	# unsuccessfully.
	desc "Pull and import the latest public data from production (dev only)"
	task :pull => ["db:abort_if_pending_migrations", :environment] do
		unless Rails.env.development?
			raise "Can only pull public data in development mode! This helps us " +
				"ensure we won't overwrite the production database accidentally."
		end

		# The command-line arguments for `mysql`.
		import_args = []

		# The connection details for our database!
		config = ApplicationRecord.connection_db_config.configuration_hash
		import_args << "--host=#{config[:host]}" if config[:host]
		import_args << "--ssl=false" # SSL is the default for recent MariaDB; override!
		import_args << "--user=#{config[:username]}" if config[:username]
		import_args << "--password=#{config[:password]}" if config[:password]
		import_args << "--database=#{config.fetch(:database)}"

		URI.open("https://impress.openneo.net/public-data/latest.sql.gz") do |file|
			# Pipe the latest public data SQL into `gunzip` to unpack it, then pipe
			# it into mysql to execute it.
			#
			# NOTE: We hand the pipeline the downloaded file's *path* and let the
			# child process open it for reading, so there's no extra Ruby file handle
			# for us to close. This assumes `file` is a `Tempfile` (as the download
			# has been so far), not an in-memory buffer with no path.
			statuses = Open3.pipeline(
				["gunzip", "-c"],
				["mysql", *import_args],
				in: file.path,
			)

			# `Open3.pipeline` returns one `Process::Status` per command; fail
			# loudly if either half of the pipeline failed, rather than leaving a
			# partially-imported database unnoticed.
			unless statuses.all?(&:success?)
				raise "Failed to import public data: `gunzip | mysql` exited " +
					"with #{statuses.map(&:to_s).join(', ')}"
			end
		end
	end
end
Create `rails public_data:pull` task, to load up the latest public data Yay, it works! Easy peasy! Love this way of integrating shell and Ruby, it's cute! 2024-03-01 13:18:58 -08:00			`require "open-uri"`
Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`require "open3"`

			`desc "Tools to save and import DTI's public modeling data"`
			`namespace :public_data do`
			`desc "Save the local database's public data to a local file"`
			`task :commit, [:name] => :environment do \|_, args\|`
			`if Rails.env.development?`
			puts "NOTE: The `public_data:commit` task is primarily meant to be " +
			`"run in production, to create public data files we can copy to our " +`
			"development machines via `public_data:pull`. I'll still run it " +
			`"locally and save to #{Rails.configuration.public_data_root}, though!"`
			`end`

			`# Generate a filename from the current time, and the option name argument`
			# provided to the command (e.g. `rails public_data:commit[scheduled]`).
			`timestamp = Time.now.utc.iso8601.gsub(':', '_')`
			`name = args.fetch(:name, "manual")`
			`filename = "#{timestamp}-#{name}.sql.gz"`
			`dest_path = Rails.configuration.public_data_root / filename`

			`args = []`

			`# The connection details for our database!`
Create `rails public_data:pull` task, to load up the latest public data Yay, it works! Easy peasy! Love this way of integrating shell and Ruby, it's cute! 2024-03-01 13:18:58 -08:00			`config = ApplicationRecord.connection_db_config.configuration_hash`
Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`args << "--host=#{config[:host]}" if config[:host]`
			`args << "--user=#{config[:username]}" if config[:username]`
			`args << "--password=#{config[:password]}" if config[:password]`

			`# Don't lock the database to do it!`
			`args << "--single-transaction"`

Require fewer db privileges to run `public_data:commit` In newer versions of MySQL, `mysqldump`'s default behavior requires accessing some privileged `INFORMATION_SCHEMA` tables, which requires the global `PROCESS` permission. Rather than require that, we can just skip this step, by adding the `--no-tablespaces` argument. This was the guidance I found when looking up this issue! https://dba.stackexchange.com/a/274460/289961 2024-05-02 13:06:27 -07:00			`# Skip dumping tablespaces, so this requires fewer privileges.`
			`args << "--no-tablespaces"`

Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`# Dump the public data tables from the primary database.`
			`args << config.fetch(:database)`
			`args += %w(species colors zones) # manual constants`
			`args += %w(alt_styles items parents_swf_assets pet_states pet_types`
			`swf_assets) # from modeling`

			`# Set up a shell, and register the commands we need.`
			`Shell.def_system_command("mysqldump")`
			`Shell.def_system_command("gzip")`
			`sh = Shell.new`

			`# Ensure the output directory exists.`
			`dest_path.dirname.mkpath`

			`# Run mysqldump, pipe it into gzip, and output to the destination file.`
Create `rails public_data:pull` task, to load up the latest public data Yay, it works! Easy peasy! Love this way of integrating shell and Ruby, it's cute! 2024-03-01 13:18:58 -08:00			`sh.transact do`
			`sh.mysqldump(*args) \| sh.gzip("-c") > dest_path.to_s`
			`end`

Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`puts "Saved dump to #{dest_path}"`

			# Link this latest dump as `latest.sql.gz`.
			`latest_path = Rails.configuration.public_data_root / "latest.sql.gz"`
Fix public_data:commit's symlinking on some platforms Huh, curious, I think what I'm seeing is: on my development machine, `File.exist?` returns true for symlinks, but, on our production machine, `File.exist?` returns false for symlinks. I imagine this is a difference in the implementation of the underlying system calls? Curious! This new check should work more reliably across platforms. I considered checking both `exists?` and `symlink?`, but decided that, in the unexpected case that `latest.sql.gz` exists but is an actual file instead of a symlink like we expect, it's probably best to avoid overwriting it anyway, and a crash on the `symlink` attempt is a reasonable way to do that. 2024-05-02 13:10:30 -07:00			`File.unlink(latest_path) if File.symlink?(latest_path)`
Oops, fix symlink for `/public-data/latest.sql.gz` Oh whoops, I was symlinking to the full path of the latest dump, which includes the site version directory in it. This meant that, if 5 new versions of the app were deployed since the most recent public data commit (and so that version is deleted), the symlink fails. In this change, we just symlink to the filename, which behaves as a relative path and should be completely resilient to deploys changing where these files ostensibly live!! 2024-05-29 19:01:23 -07:00			`File.symlink(filename, latest_path)`
Create `rails public_data:pull` task, to load up the latest public data Yay, it works! Easy peasy! Love this way of integrating shell and Ruby, it's cute! 2024-03-01 13:18:58 -08:00
Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`puts "Linked dump to #{latest_path}"`
			`end`

			`desc "Pull and import the latest public data from production (dev only)"`
Only run `public_data:pull` if there are no pending migrations Oh this was a fun little dev environment bug: I ran `public_data:pull` on my laptop before migrating my database, so the `items` table pulled as the latest production version, which included the migrations, but they hadn't been marked as "run" yet. So Rails was still telling me I needed to run them, but the migrations themselves were crashing, with stuff like "there's already a column with this name!" This change ensures that `public_data:pull` won't run until migrations are done, to prevent silly accidents like that. 2024-06-18 14:52:54 -07:00			`task :pull => ["db:abort_if_pending_migrations", :environment] do`
Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`unless Rails.env.development?`
			`raise "Can only pull public data in development mode! This helps us " +`
			`"ensure we won't overwrite the production database accidentally."`
			`end`

Create `rails public_data:pull` task, to load up the latest public data Yay, it works! Easy peasy! Love this way of integrating shell and Ruby, it's cute! 2024-03-01 13:18:58 -08:00			`args = []`

			`# The connection details for our database!`
			`config = ApplicationRecord.connection_db_config.configuration_hash`
			`args << "--host=#{config[:host]}" if config[:host]`
Ensure `rails public_data:pull` doesn't use SSL for localhost I upgraded our local MariaDB for compatibility with the latest server dumps (https://mariadb.org/mariadb-dump-file-compatibility-change/), and I thiiiink what I'm seeing is that, also in this version of MariaDB, the default value for the `ssl` option is `true`? That is, command-line clients will try to connect over SSL by default—which isn't generally supported on development servers, where this task runs. I could probably fix this with a change to my local config? But I figure I can't really picture a scenario where this option being set in the task would be wrong, but I can see it saving future people time if they're working in a similar environment. So, let's just set it! 2024-07-07 17:21:54 -07:00			`args << "--ssl=false" # SSL is the default for recent MariaDB; override!`
Create `rails public_data:pull` task, to load up the latest public data Yay, it works! Easy peasy! Love this way of integrating shell and Ruby, it's cute! 2024-03-01 13:18:58 -08:00			`args << "--user=#{config[:username]}" if config[:username]`
			`args << "--password=#{config[:password]}" if config[:password]`
			`args << "--database=#{config.fetch(:database)}"`

			`# Set up a shell, and register the commands we need.`
			`Shell.def_system_command("mysql")`
			`Shell.def_system_command("gunzip")`
			`sh = Shell.new`

			`URI.open("https://impress.openneo.net/public-data/latest.sql.gz") do \|file\|`
			# Pipe the latest public data SQL into `gunzip` to unpack it, then pipe
			`# it into mysql to execute it.`
			`#`
			# NOTE: We need `open(file)` to wrap it in a plain `File` object, so the
			# `Shell` will recognize it correctly! It doesn't accept `Tempfile`.
			`sh.transact do`
			`(sh.gunzip("-c") < open(file)) \| sh.mysql(*args)`
			`end`
			`end`
Create `rails public_data:commit` task, to share public data dumps I'm starting to port over the functionality that was previously just, me running `yarn db:export:public-data` in `impress-2020` and committing it to Git LFS every time. My immediate motivation is that the `impress-2020` git repository is getting weirdly large?? Idk how these 40MB files have blown up to a solid 16GB of Git LFS data (we don't have THAT many!!!), but I guess there's something about Git LFS's architecture and disk usage that I'm not understanding. So, let's move to a simpler system in which we don't bind the public data to the codebase, but instead just regularly dump it in production and make it available for download. This change adds the `rails public_data:commit` task, which when run in production will make the latest available at `https://impress.openneo.net/public-data/latest.sql.gz`, and will also store a running log of previous dumps, viewable at `https://impress.openneo.net/public-data/`. Things left to do: 1. Create a `rails public_data:pull` task, to download `latest.sql.gz` and import it into the local development database. 2. Set up a cron job to dump this out regularly, idk maybe weekly? That will grow, but not very fast (about 2GB per year), and we can add logic to rotate out old ones if it starts to grow too far. (If we wanted to get really intricate, we could do like, daily for the past week, then weekly for the past 3 months, then monthly for the past year, idk. There must be tools that do this!) 2024-02-29 14:30:33 -08:00			`end`
			`end`