forked from Smerity/cc-quick-scripts
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcount_responses.sh
More file actions
executable file
·23 lines (23 loc) · 1.01 KB
/
count_responses.sh
File metadata and controls
executable file
·23 lines (23 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/bin/bash
# Saves total response count to responses folder
# Returns total byte count to size folder
# Naive testing showed wc --bytes faster than dd or pv (with pv not being 100% accurate)
# Example usage: shuf ~/warc_list.txt | parallel --no-notice --bar -P 64 ./count_responses.sh {}
#
# Require the object path (relative to s3://commoncrawl/) as the only argument;
# without it the S3 URL and both output file names would be meaningless.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <path-within-s3://commoncrawl/>\n' "${0##*/}" >&2
  exit 2
fi

warc_path=$1
# Both result files are named after the last path component of the input.
warc_name=$(basename -- "$warc_path")

mkdir -p responses size || exit 1
###
# Get the data from S3 and send it to stdout, where zcat decompresses it
s3cmd get "s3://commoncrawl/${warc_path}" - | zcat |
  # send the data two places, first to count how many bytes and record that
  tee >(wc --bytes > "size/${warc_name}") |
  # second to count the lines that mark a response record.
  # -a: WARC payloads contain NUL bytes; without it grep switches to binary
  #     mode part-way through and stops reporting matches, undercounting.
  # LC_ALL=C: byte-wise matching, independent of the caller's locale.
  LC_ALL=C grep -a -c "WARC-Type: response" > "responses/${warc_name}"
# Snapshot the per-stage exit codes before any other command overwrites them.
stage_status=("${PIPESTATUS[@]}")

# grep exits 1 when it simply found no matches (count 0), which is not an
# error; every other non-zero status means the recorded numbers are suspect.
if (( stage_status[0] != 0 || stage_status[1] != 0 ||
      stage_status[2] != 0 || stage_status[3] > 1 )); then
  printf '%s: failed processing %s (s3cmd=%d zcat=%d tee=%d grep=%d)\n' \
    "${0##*/}" "$warc_path" "${stage_status[@]}" >&2
  exit 1
fi
###
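# Helpful:
# Errors during a run can leave empty or tiny result files. The commands below
# (presumably run from inside size/ or responses/) keep only values with enough
# digits: the first of each pair counts those results, the second sums them.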
# cat * | grep -E '[0-9]{6,10}' | wc -l
# cat * | grep -E '[0-9]{6,10}' | awk '{ SUM += $1} END { print SUM }'
###
# cat * | grep -E '[0-9]{4,10}' | wc -l
# cat * | grep -E '[0-9]{4,10}' | awk '{ SUM += $1} END { print SUM }'