From 2a179c4fb29542dcee3255c034899cffc240389c Mon Sep 17 00:00:00 2001 From: MothOnMars Date: Fri, 4 May 2018 09:33:23 -0700 Subject: [PATCH 1/2] [#157319228] implement SearchgovDomain#delay method --- app/models/searchgov_domain.rb | 7 ++++++ spec/models/searchgov_domain_spec.rb | 32 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/app/models/searchgov_domain.rb b/app/models/searchgov_domain.rb index 33422cc5f0..71bf59eb2e 100644 --- a/app/models/searchgov_domain.rb +++ b/app/models/searchgov_domain.rb @@ -4,6 +4,13 @@ class SearchgovDomain < ActiveRecord::Base attr_readonly :domain + def delay + @delay ||= begin + robotex = Robotex.new 'usasearch' + robotex.delay("http://#{domain}/") || 1 + end + end + private def valid_domain? diff --git a/spec/models/searchgov_domain_spec.rb b/spec/models/searchgov_domain_spec.rb index d1af0f0a16..c93ca13088 100644 --- a/spec/models/searchgov_domain_spec.rb +++ b/spec/models/searchgov_domain_spec.rb @@ -65,4 +65,36 @@ end end end + + describe '#delay' do + subject(:delay) { searchgov_domain.delay } + + before do + stub_request(:get, "http://#{domain}/robots.txt"). + to_return(status: [200, "OK"], headers: { content_type: 'text/plain' }, body: robots) + end + + context 'when a delay is specified in robots.txt' do + let(:robots) { "User-agent: *\nCrawl-delay: 10" } + + it { is_expected.to eq 10 } + end + + context 'when no delay is specified' do + let(:robots) { "User-agent: *\nDisallow: /somedir/" } + + it 'defaults to 1' do + expect(delay).to eq 1 + end + end + + context 'when a delay is specified for the "usasearch" user agent' do + let(:robots) { "User-agent: *\nCrawl-delay: 10\nUser-agent: usasearch\nCrawl-delay: 2" } + + # This needs to be fixed in the robotex gem: + # https://github.com/chriskite/robotex/issues/9 + # https://www.pivotaltracker.com/story/show/157329443 + xit { is_expected.to eq 2 } + end + end end From 2d245b07920716ddfcf42e0fd75f365ce12f6490 Mon Sep 17 00:00:00 2001 From: MothOnMars Date: Mon, 7 May 2018 12:24:58 -0700 Subject: [PATCH 2/2] squashme: add spec, bump robotex --- Gemfile.lock | 4 +++- spec/models/searchgov_domain_spec.rb | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index bb0fd32ad2..e5c6a5d092 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -48,9 +48,10 @@ GIT GIT remote: https://github.com/MothOnMars/robotex - revision: e51e6085d21ba532aaa3d54e979c92090c842af4 + revision: 7e91000a3592e96ead47591e6c5c15233c19ab43 specs: robotex (1.0.0) + open_uri_redirections (~> 0.2.1) GIT remote: https://github.com/MothOnMars/sitemaps @@ -433,6 +434,7 @@ GEM nokogumbo (1.5.0) nokogiri oj (3.3.10) + open_uri_redirections (0.2.1) os (0.9.6) paperclip (5.2.1) activemodel (>= 4.2.0) diff --git a/spec/models/searchgov_domain_spec.rb b/spec/models/searchgov_domain_spec.rb index c93ca13088..0c6688d0bc 100644 --- a/spec/models/searchgov_domain_spec.rb +++ b/spec/models/searchgov_domain_spec.rb @@ -78,6 +78,17 @@ let(:robots) { "User-agent: *\nCrawl-delay: 10" } it { is_expected.to eq 10 } + + context 'when the domain is redirected' do + before do + stub_request(:get, "http://#{domain}/robots.txt"). + to_return(status: 301, headers: { location: "https://#{domain}/robots.txt" }, body: "") + stub_request(:get, "https://#{domain}/robots.txt"). + to_return(status: [200, "OK"], headers: { content_type: 'text/plain' }, body: robots) + end + + it { is_expected.to eq 10 } + end end context 'when no delay is specified' do