diff --git a/app/controllers/api/v2/works_controller.rb b/app/controllers/api/v2/works_controller.rb index 3dee15833ef..6557d24774d 100644 --- a/app/controllers/api/v2/works_controller.rb +++ b/app/controllers/api/v2/works_controller.rb @@ -127,7 +127,7 @@ def find_work_by_import_url(original_url) message = "Please provide the original URL for the work." else # We know the url will be identical no need for a call to find_by_url - works = Work.where(imported_from_url: original_url) + works = Work.joins(:imported_url).where(imported_url: { original: original_url }) message = if works.empty? "No work has been imported from \"" + original_url + "\"." else diff --git a/app/controllers/opendoors/tools_controller.rb b/app/controllers/opendoors/tools_controller.rb index 1f3cb805a64..f87e7b5b461 100644 --- a/app/controllers/opendoors/tools_controller.rb +++ b/app/controllers/opendoors/tools_controller.rb @@ -46,12 +46,16 @@ def url_update flash[:error] = ts("The imported-from url you are trying to set doesn't seem valid.") else # check for any other works - works = Work.where(imported_from_url: @imported_from_url) + works = Work.joins(:imported_url).where(imported_url: { original: @imported_from_url }) if works.count > 0 flash[:error] = ts("There is already a work imported from the url %{url}.", url: @imported_from_url) else # ok let's try to update - @work.update_attribute(:imported_from_url, @imported_from_url) + if @work.imported_url.nil? + @work.imported_url = ImportedUrl.create(original: @imported_from_url) + else + @work.imported_url.update(original: @imported_from_url) + end flash[:notice] = "Updated imported-from url for #{@work.title} to #{@imported_from_url}" end end diff --git a/app/models/search/work_indexer.rb b/app/models/search/work_indexer.rb index f7c4c260143..c3a0dbcb6d6 100644 --- a/app/models/search/work_indexer.rb +++ b/app/models/search/work_indexer.rb @@ -55,9 +55,6 @@ def self.mapping title_to_sort_on: { type: "keyword" }, - imported_from_url: { - type: "keyword" - }, work_types: { type: "keyword" }, diff --git a/app/models/story_parser.rb b/app/models/story_parser.rb index 5d60e573386..fec082e78ae 100644 --- a/app/models/story_parser.rb +++ b/app/models/story_parser.rb @@ -302,8 +302,7 @@ def set_work_attributes(work, location = "", options = {}) raise Error, "Work could not be downloaded" if work.nil? @options = options - work.imported_from_url = location # @todo remove this as part of AO3-6979 - work.imported_url = ImportedUrl.new(original: work.imported_from_url) + work.imported_url = ImportedUrl.new(original: location) work.ip_address = options[:ip_address] work.expected_number_of_chapters = work.chapters.length diff --git a/app/models/work.rb b/app/models/work.rb index 9a10e1dd0b6..74e7883b316 100755 --- a/app/models/work.rb +++ b/app/models/work.rb @@ -298,7 +298,7 @@ def expire_caches series.each(&:expire_caches) Work.expire_work_blurb_version(id) - Work.flush_find_by_url_cache unless imported_from_url.blank? + Work.flush_find_by_url_cache unless imported_url.nil? end def update_pseud_and_collection_indexes @@ -419,16 +419,21 @@ def self.find_by_url_cache_key(url) def self.find_by_url_uncached(url) url = UrlFormatter.new(url) - Work.where(imported_from_url: url.original).first || - Work.where(imported_from_url: [url.minimal, - url.with_http, url.with_https, - url.no_www, url.with_www, - url.encoded, url.decoded, - url.minimal_no_protocol_no_www]).first || - Work.where("imported_from_url LIKE ? or imported_from_url LIKE ?", - "http://#{url.minimal_no_protocol_no_www}%", - "https://#{url.minimal_no_protocol_no_www}%").select do |w| - work_url = UrlFormatter.new(w.imported_from_url) + + Work.joins(:imported_url).where(imported_url: { original: @imported_from_url }).first || + Work.joins(:imported_url).where(imported_url: { minimal: url.minimal }).first || + Work.joins(:imported_url).where(imported_url: { with_http: url.with_http }).first || + Work.joins(:imported_url).where(imported_url: { with_https: url.with_https }).first || + Work.joins(:imported_url).where(imported_url: { no_www: url.no_www }).first || + Work.joins(:imported_url).where(imported_url: { with_www: url.with_www }).first || + Work.joins(:imported_url).where(imported_url: { encoded: url.encoded }).first || + Work.joins(:imported_url).where(imported_url: { decoded: url.decoded }).first || + Work.joins(:imported_url).where(imported_url: { minimal_no_protocol_no_www: url.minimal_no_protocol_no_www }).first || + Work.joins(:imported_url).where(imported_url: { minimal: url.minimal }).first || + Work.joins(:imported_url).where("imported_urls.original LIKE ? or imported_urls.original LIKE ?", + "http://#{url.minimal_no_protocol_no_www}%", + "https://#{url.minimal_no_protocol_no_www}%").select do |w| + work_url = UrlFormatter.new(w.imported_url&.original) %w[original minimal no_www with_www with_http with_https encoded decoded].any? do |method| work_url.send(method) == url.send(method) end diff --git a/db/migrate/20260519171051_remove_imported_from_url_field_from_works.rb b/db/migrate/20260519171051_remove_imported_from_url_field_from_works.rb new file mode 100644 index 00000000000..fd5b3e7d226 --- /dev/null +++ b/db/migrate/20260519171051_remove_imported_from_url_field_from_works.rb @@ -0,0 +1,12 @@ +class RemoveImportedFromUrlFieldFromWorks < ActiveRecord::Migration[8.1] + uses_departure! if Rails.env.staging? || Rails.env.production? + + def up + remove_index :works, :imported_from_url + remove_column :works, :imported_from_url + end + + def down + raise ActiveRecord::IrreversibleMigration, "This migration cannot be reverted because we can't restore the imported_from_url column data" + end +end diff --git a/lib/tasks/opendoors.rake b/lib/tasks/opendoors.rake index da057d2303f..0f6f4208cd3 100644 --- a/lib/tasks/opendoors.rake +++ b/lib/tasks/opendoors.rake @@ -1,19 +1,19 @@ namespace :opendoors do - - class UrlUpdater - def update_work(row) - begin - work = Work.find(row["AO3 id"]) - if work&.imported_from_url.blank? || work&.imported_from_url == row["URL Imported From"] - work.imported_from_url = row["Original URL"] - work.save! - "#{work.id}\twas updated: its import url is now #{work.imported_from_url}" - else - "#{work.id}\twas not changed: its import url is #{work.imported_from_url}" - end - rescue StandardError => e - "#{row["AO3 id"]}\twas not changed: #{e}" + def update_work(row) + begin + work = Work.find(row["AO3 id"]) + # ImportedUrl needs an original value, so we use the imported url temporarily and only save if we're actually updating it + work.imported_url = ImportedUrl.new(:original => row["URL Imported From"]) if work.imported_url.nil? + + if work.imported_url.original == row["URL Imported From"] + work.imported_url.original = row["Original URL"] + work.imported_url.save! + "#{work.id}\twas updated: its import url is now #{work.imported_url.original}" + else + "#{work.id}\twas not changed: its import url is #{work.imported_url.original}" end + rescue StandardError => e + "#{row["AO3 id"]}\twas not changed: #{e}" end end @@ -28,10 +28,8 @@ namespace :opendoors do begin f = File.open("opendoors_result.txt", "w") - url_updater = UrlUpdater.new - CSV.foreach(loc, headers: true) do |row| - result = url_updater.update_work(row) + result = update_work(row) puts result f.write(result) end diff --git a/lib/tasks/work_import_urls.rake b/lib/tasks/work_import_urls.rake deleted file mode 100644 index 49b5a8b3e54..00000000000 --- a/lib/tasks/work_import_urls.rake +++ /dev/null @@ -1,30 +0,0 @@ -namespace :work_import_urls do - desc "Backfill work_import_urls from works.imported_from_url (run BEFORE removing the column)" - task backfill: :environment do - batch_size = 1000 - - # Query the column directly since this task runs before column removal - scope = Work.where("imported_from_url IS NOT NULL AND imported_from_url != ''") - total = scope.count - processed = 0 - failed = 0 - - puts "Backfilling #{total} work_import_urls records..." - - scope.find_each(batch_size: batch_size) do |work| - next if ImportedUrl.exists?(work_id: work.id) - - begin - work.imported_url = ImportedUrl.create(original: work.imported_from_url) - rescue StandardError => e - puts "------- Error on import for work_id #{work.id}: #{e.inspect}" - failed += 1 - next - end - processed += 1 - puts "Processed #{processed}/#{total}" if (processed % batch_size).zero? - end - - puts "Done! Backfilled #{processed} records, failed #{failed} records." - end -end diff --git a/spec/controllers/api/v2/api_works_search_spec.rb b/spec/controllers/api/v2/api_works_search_spec.rb index d2365172510..26a694eec3f 100644 --- a/spec/controllers/api/v2/api_works_search_spec.rb +++ b/spec/controllers/api/v2/api_works_search_spec.rb @@ -11,7 +11,11 @@ def post_search_result(valid_params) describe "API v2 WorksController - Search", type: :request, work_search: true do describe "valid work URL request" do - let!(:work) { create(:work, imported_from_url: "foo") } + let!(:work) do + work = create(:work) + work.imported_url = ImportedUrl.create(original: "foo") + work + end it "returns 200 OK" do valid_params = { works: [{ original_urls: %w(bar foo) }] } diff --git a/spec/controllers/api/v2/api_works_spec.rb b/spec/controllers/api/v2/api_works_spec.rb index 8b2174b8298..2a8ae3a2f30 100644 --- a/spec/controllers/api/v2/api_works_spec.rb +++ b/spec/controllers/api/v2/api_works_spec.rb @@ -514,7 +514,8 @@ end it "work_url_from_external returns the work url when a work is found" do - work1 = create(:work, imported_from_url: "http://foo.com") + work1 = create(:work) + work1.imported_url = ImportedUrl.new(original: "http://foo.com") work_url_response = @under_test.instance_eval { find_work_by_import_url("http://foo.com") } expect(work_url_response[:works].first).to eq work1 end diff --git a/spec/controllers/opendoors/tools_controller_spec.rb b/spec/controllers/opendoors/tools_controller_spec.rb index 2cab4b01877..8f2418d8694 100644 --- a/spec/controllers/opendoors/tools_controller_spec.rb +++ b/spec/controllers/opendoors/tools_controller_spec.rb @@ -67,7 +67,11 @@ context "with a valid work ID" do let(:work) { create(:work) } - let(:work_with_imported_from_url) { create(:work, imported_from_url: "http://example.org/my-immortal") } + let!(:work_with_imported_from_url) do + work = create(:work) + work.imported_url = ImportedUrl.create(original: "http://example.org/my-immortal") + work + end it "redirects to tools if imported-from URL is missing" do post :url_update, params: { work_url: "/works/#{work.id}/" } @@ -80,7 +84,7 @@ end it "redirects to tools if imported-from URL is already used in another work" do - url = work_with_imported_from_url.imported_from_url + url = work_with_imported_from_url.imported_url.original post :url_update, params: { work_url: "/works/#{work.id}/", imported_from_url: url } it_redirects_to_with_error(opendoors_tools_path(imported_from_url: url), "There is already a work imported from the url #{url}.") end @@ -90,7 +94,7 @@ post :url_update, params: { work_url: "http://example.org/works/#{work.id}/", imported_from_url: url } it_redirects_to_with_notice(opendoors_tools_path(imported_from_url: url), "Updated imported-from url for #{work.title} to #{url}") work.reload - expect(work.imported_from_url).to eq(url) + expect(work.imported_url.original).to eq(url) end it "updates work if imported-from URL has non-ASCII characters" do @@ -99,7 +103,7 @@ encoded_url = URI::Parser.new.escape(url) it_redirects_to_with_notice(opendoors_tools_path(imported_from_url: encoded_url), "Updated imported-from url for #{work.title} to #{encoded_url}") work.reload - expect(work.imported_from_url).to eq(encoded_url) + expect(work.imported_url.original).to eq(encoded_url) end end end diff --git a/spec/lib/tasks/opendoors.rake_spec.rb b/spec/lib/tasks/opendoors.rake_spec.rb index 380d71bc08c..92b8297d398 100644 --- a/spec/lib/tasks/opendoors.rake_spec.rb +++ b/spec/lib/tasks/opendoors.rake_spec.rb @@ -8,8 +8,12 @@ original_url = "http://another/1" original_url2 = "http://another/2" - let!(:work_with_temp_url) { create(:work, id: id1, imported_from_url: temp_url) } - let!(:work_with_no_url) { create(:work, id: id2, imported_from_url: nil) } + let!(:work_with_temp_url) { + work_with_temp_url = create(:work, id: id1) + work_with_temp_url.imported_url = ImportedUrl.create(original: temp_url) + work_with_temp_url + } + let!(:work_with_no_url) { create(:work, id: id2) } after do File.delete("opendoors_result.txt") if File.exist?("opendoors_result.txt") @@ -19,8 +23,8 @@ it "takes a path to a CSV and updates works" do path = file_fixture("opendoors_import_url_mapping.csv") subject.invoke(path) - expect(Work.find(work_with_temp_url.id).imported_from_url).to eq(original_url) - expect(Work.find(work_with_no_url.id).imported_from_url).to eq(original_url2) + expect(Work.find(work_with_temp_url.id).imported_url.original).to eq(original_url) + expect(Work.find(work_with_no_url.id).imported_url.original).to eq(original_url2) end it "fails if the CSV path is invalid" do expect { subject.invoke("not a path") }.to raise_error Errno::ENOENT @@ -28,8 +32,11 @@ end context "UrlUpdater" do - let(:url_updater) { UrlUpdater.new } - let(:work_with_other_url) { create(:work, imported_from_url: "http://another/1") } + let!(:work_with_other_url) do + work_with_other_url = create(:work) + work_with_other_url.imported_url = ImportedUrl.create(original: "http://another/1") + work_with_other_url + end it "returns an error if the work is not found" do row = { @@ -37,7 +44,7 @@ "Original URL" => "http://another/2", "AO3 id" => 7777773 } - result = url_updater.update_work(row) + result = update_work(row) expect(result).to eq("7777773\twas not changed: Couldn't find Work with 'id'=7777773") end @@ -47,7 +54,7 @@ "Original URL" => "http://another/2", "AO3 id" => work_with_other_url.id } - result = url_updater.update_work(row) + result = update_work(row) expect(result).to match("\\d+\twas not changed: its import url is http://another/1") end @@ -57,7 +64,7 @@ "Original URL" => "http://another/2", "AO3 id" => work_with_no_url.id } - result = url_updater.update_work(row) + result = update_work(row) expect(result).to match("\\d+\twas updated: its import url is now http://another/2") end @@ -67,7 +74,7 @@ "Original URL" => "http://another/2", "AO3 id" => work_with_temp_url.id } - result = url_updater.update_work(row) + result = update_work(row) expect(result).to match("\\d+\twas updated: its import url is now http://another/2") end end diff --git a/spec/lib/tasks/work_import_urls.rake_spec.rb b/spec/lib/tasks/work_import_urls.rake_spec.rb deleted file mode 100644 index 77270a1d4ae..00000000000 --- a/spec/lib/tasks/work_import_urls.rake_spec.rb +++ /dev/null @@ -1,23 +0,0 @@ -require "spec_helper" -require "rake" - -describe "rake work_import_urls:backfill" do - it "should import the urls" do - work = create(:work) - work.imported_from_url = "http://www.trickster.org/llwyden/misc/cracked.html" - work.save!(validate: false) - - work_not_migrate = create(:work) - work_not_migrate.imported_from_url = "" - work_not_migrate.save!(validate: false) - - subject.invoke - - expect(ImportedUrl.count).to eq(1) - expect(ImportedUrl.first.original).to eq(work.imported_from_url) - - subject.invoke - - expect(ImportedUrl.count).to eq(1) - end -end diff --git a/spec/models/story_parser_spec.rb b/spec/models/story_parser_spec.rb index 7cd2cc532d7..60d1464d26e 100644 --- a/spec/models/story_parser_spec.rb +++ b/spec/models/story_parser_spec.rb @@ -117,19 +117,22 @@ class StoryParser let(:location_partial_match) { "http://testme.org/welcome_to_test_vale/12345" } it "should recognise previously imported www. works" do - @work = FactoryBot.create(:work, imported_from_url: location_with_www) + @work = FactoryBot.create(:work) + @work.imported_url = ImportedUrl.new(original: location_with_www) expect { @sp.check_for_previous_import(location_no_www) }.to raise_exception(StoryParser::Error) end it "should recognise previously imported non-www. works" do - @work = FactoryBot.create(:work, imported_from_url: location_no_www) + @work = FactoryBot.create(:work) + @work.imported_url = ImportedUrl.new(original: location_no_www) expect { @sp.check_for_previous_import(location_with_www) }.to raise_exception(StoryParser::Error) end it "should not perform a partial match on work import locations" do - @work = create(:work, imported_from_url: location_partial_match) + @work = create(:work) + @work.imported_url = ImportedUrl.new(original: location_partial_match) expect { @sp.check_for_previous_import("http://testme.org/welcome_to_test_vale/123") }.to_not raise_exception end diff --git a/spec/models/work_spec.rb b/spec/models/work_spec.rb index ad4a102125f..63cdf502aee 100644 --- a/spec/models/work_spec.rb +++ b/spec/models/work_spec.rb @@ -360,7 +360,8 @@ http://lj-site.com/bar/foo?color=blue https://www.lj-site.com/bar/foo?color=blue http://www.foo.com/bar https://www.foo.com/bar].each do |url| - work = create(:work, imported_from_url: url) + work = create(:work) + work.imported_url = ImportedUrl.create(original: url) expect(Work.find_by_url(url)).to eq(work) work.destroy end @@ -372,33 +373,38 @@ "http://efiction-site.com/viewstory.php?sid=123" => "http://efiction-site.com/viewstory.php?sid=456", "http://www.foo.com/i-am-something" => "http://foo.com/i-am-something/else" }.each do |import_url, find_url| - work = create(:work, imported_from_url: import_url) + work = create(:work) + work.imported_url = ImportedUrl.create(original: import_url) expect(Work.find_by_url(find_url)).to_not eq(work) work.destroy end end it "should find works imported with irrelevant query parameters" do - work = create(:work, imported_from_url: "http://lj-site.com/thing1?style=mine") + work = create(:work) + work.imported_url = ImportedUrl.create(original: "http://lj-site.com/thing1?style=mine") expect(Work.find_by_url("http://lj-site.com/thing1?style=other")).to eq(work) work.destroy end it "finds works imported with HTTP protocol and irrelevant query parameters" do - work = create(:work, imported_from_url: "http://lj-site.com/thing1?style=mine") + work = create(:work) + work.imported_url = ImportedUrl.create(original: "http://lj-site.com/thing1?style=mine") expect(Work.find_by_url("https://lj-site.com/thing1?style=other")).to eq(work) work.destroy end it "finds works imported with HTTPS protocol and irrelevant query parameters" do - work = create(:work, imported_from_url: "https://lj-site.com/thing1?style=mine") + work = create(:work) + work.imported_url = ImportedUrl.create(original: "https://lj-site.com/thing1?style=mine") expect(Work.find_by_url("http://lj-site.com/thing1?style=other")).to eq(work) work.destroy end it "gets the work from cache when searching for an imported work by URL" do url = "http://lj-site.com/thing2" - work = create(:work, imported_from_url: url) + work = create(:work) + work.imported_url = ImportedUrl.new(original: url) expect(Rails.cache.read(Work.find_by_url_cache_key(url))).to be_nil expect(Work.find_by_url(url)).to eq(work) expect(Rails.cache.read(Work.find_by_url_cache_key(url))).to eq(work) diff --git a/test/fixtures/imported_urls.yml b/test/fixtures/imported_urls.yml new file mode 100644 index 00000000000..5e08485005f --- /dev/null +++ b/test/fixtures/imported_urls.yml @@ -0,0 +1,15 @@ +--- +imported_url_00001: + created_at: 2008-12-06 22:56:52 Z + updated_at: 2008-12-06 22:56:52 Z + id: 1 + work_id: 82 + original: http://www.trickster.org/llwyden/misc/cracked.html + minimal: + minimal_no_protocol_no_www: + no_www: + with_www: + with_http: + with_https: + encoded: + decoded: