# Patch summary (text before the first "diff --git" header is ignored by patch tools).
#
# Gemfile.lock
#   * Moves the git-sourced robotex gem (github.com/MothOnMars/robotex) to
#     revision 7e91000a3592e96ead47591e6c5c15233c19ab43.
#   * Records open_uri_redirections (~> 0.2.1) as a robotex dependency and
#     locks open_uri_redirections at 0.2.1 in the GEM section.
#
# app/models/searchgov_domain.rb
#   * Adds the public method SearchgovDomain#delay. It builds a Robotex client
#     with the 'usasearch' user agent, asks it for the delay that applies to
#     "http://<domain>/", and falls back to 1 when Robotex returns nothing.
#     The result is memoized in @delay, so the lookup runs at most once per
#     model instance.
#
# spec/models/searchgov_domain_spec.rb
#   * Adds a '#delay' example group. robots.txt is stubbed with stub_request,
#     so no real HTTP request is made. Covered cases:
#       - a Crawl-delay of 10 for "User-agent: *" is returned as 10;
#       - the same when http://<domain>/robots.txt answers 301 and points to
#         the https URL;
#       - a robots.txt without Crawl-delay yields the default of 1;
#       - a Crawl-delay specific to the "usasearch" agent is expected to win,
#         but that example is disabled with xit pending the robotex fix
#         linked in the spec.

diff --git a/Gemfile.lock b/Gemfile.lock
index bb0fd32ad2..e5c6a5d092 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -48,9 +48,10 @@ GIT
 
 GIT
   remote: https://github.com/MothOnMars/robotex
-  revision: e51e6085d21ba532aaa3d54e979c92090c842af4
+  revision: 7e91000a3592e96ead47591e6c5c15233c19ab43
   specs:
     robotex (1.0.0)
+      open_uri_redirections (~> 0.2.1)
 
 GIT
   remote: https://github.com/MothOnMars/sitemaps
@@ -433,6 +434,7 @@ GEM
     nokogumbo (1.5.0)
       nokogiri
     oj (3.3.10)
+    open_uri_redirections (0.2.1)
     os (0.9.6)
     paperclip (5.2.1)
       activemodel (>= 4.2.0)
diff --git a/app/models/searchgov_domain.rb b/app/models/searchgov_domain.rb
index 33422cc5f0..71bf59eb2e 100644
--- a/app/models/searchgov_domain.rb
+++ b/app/models/searchgov_domain.rb
@@ -4,6 +4,13 @@ class SearchgovDomain < ActiveRecord::Base
 
   attr_readonly :domain
 
+  def delay
+    @delay ||= begin
+      robotex = Robotex.new 'usasearch'
+      robotex.delay("http://#{domain}/") || 1
+    end
+  end
+
   private
 
   def valid_domain?
diff --git a/spec/models/searchgov_domain_spec.rb b/spec/models/searchgov_domain_spec.rb
index d1af0f0a16..0c6688d0bc 100644
--- a/spec/models/searchgov_domain_spec.rb
+++ b/spec/models/searchgov_domain_spec.rb
@@ -65,4 +65,47 @@
       end
     end
   end
+
+  describe '#delay' do
+    subject(:delay) { searchgov_domain.delay }
+
+    before do
+      stub_request(:get, "http://#{domain}/robots.txt").
+        to_return(status: [200, "OK"], headers: { content_type: 'text/plain' }, body: robots)
+    end
+
+    context 'when a delay is specified in robots.txt' do
+      let(:robots) { "User-agent: *\nCrawl-delay: 10" }
+
+      it { is_expected.to eq 10 }
+
+      context 'when the domain is redirected' do
+        before do
+          stub_request(:get, "http://#{domain}/robots.txt").
+            to_return(status: 301, headers: { location: "https://#{domain}/robots.txt" }, body: "")
+          stub_request(:get, "https://#{domain}/robots.txt").
+            to_return(status: [200, "OK"], headers: { content_type: 'text/plain' }, body: robots)
+        end
+
+        it { is_expected.to eq 10 }
+      end
+    end
+
+    context 'when no delay is specified' do
+      let(:robots) { "User-agent: *\nDisallow: /somedir/" }
+
+      it 'defaults to 1' do
+        expect(delay).to eq 1
+      end
+    end
+
+    context 'when a delay is specified for the "usasearch" user agent' do
+      let(:robots) { "User-agent: *\nCrawl-delay: 10\nUser-agent: usasearch\nCrawl-delay: 2" }
+
+      # This needs to be fixed in the robotex gem:
+      # https://github.com/chriskite/robotex/issues/9
+      # https://www.pivotaltracker.com/story/show/157329443
+      xit { is_expected.to eq 2 }
+    end
+  end
 end