-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathScraper.py
More file actions
122 lines (103 loc) · 3.67 KB
/
Scraper.py
File metadata and controls
122 lines (103 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#! /usr/bin/python
# Web Scraper for Pastebin.
# Downloads the most recent pastes and saves each one as a text file on your PC.
# Author: w00f
import platform
import os
import sys
import time
import re
import requests
from bs4 import BeautifulSoup
def ac_sites(link_list):
    """Return the contents of the accessed-links tracking file, creating it if absent.

    link_list: path of the text file that records already-downloaded paste links.
    Returns the full file contents as a string (header line included).
    """
    # First run: create the tracker with its header line.
    # (The original opened in 'w' and then tried to .read() the write-only
    # handle, relying on the resulting exception plus a recursive call and a
    # global variable to recover; this does the same thing directly.)
    if not os.path.exists(link_list):
        with open(link_list, 'w') as tracker:
            tracker.write('Accessed links\n\n')
    # Read the file back, whether it was just created or already existed.
    with open(link_list) as tracker:
        return tracker.read()
def check_directory(arg):
    """Ensure the download directory Pastebin/<arg> exists.

    Prints a status line telling the user whether the directory was
    already present or had to be created.
    """
    target = 'Pastebin/' + arg
    if os.path.isdir(target):
        print("Starting Download")
    else:
        # exist_ok closes the check-then-create race: another process (or a
        # second instance of this scraper) may create the directory between
        # the isdir() test above and this call.
        os.makedirs(target, exist_ok=True)
        print("Creating directory and starting download")
def get_links(soup):
    """Collect new paste paths from the first <table> of a pastebin listing page.

    A candidate href must be 9 characters long and match '/XXXXXXXX'
    (8 alphanumerics), and must not already appear in the module-level
    accessed_sites text.  Returns the list of new hrefs.
    """
    table = soup.find('table')
    if table is None:
        # Page layout changed or the request returned an error page;
        # the original crashed with AttributeError here.
        return []
    # Compile once instead of per-anchor; same pattern as before.
    paste_re = re.compile(r'\b[a-zA-Z0-9]{8}\b', re.IGNORECASE)
    links = []
    for anchor in table.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href made the original fail on len(None).
        if href is None or len(href) != 9:
            continue
        if href in accessed_sites:
            continue
        if paste_re.search(href):
            print('[+] New link ', href)
            links.append(href)
    return links
def download_contents(link, arg):
    """Download one paste and save it as Pastebin/<arg>/<title>.txt.

    link: paste path such as '/abc12345'.
    arg:  subdirectory name under Pastebin/ (must already exist).
    """
    print('[+] Downloading http://pastebin.com'+link+'.txt')
    site = requests.get('http://pastebin.com' + link)
    page = BeautifulSoup(site.text, 'html.parser')
    title = page.find('h1').string
    paste_body = page.find('textarea')
    if title == 'Untitled':
        # Fall back to the paste id from the parameter.  The original read
        # the module-level loop variable `content` here, silently coupling
        # this function to the caller's loop-variable name.
        title = link
    # basename() strips any path separators so a hostile title cannot
    # escape the download folder.
    target = os.path.join('Pastebin/' + arg, os.path.basename(title)) + '.txt'
    try:
        out = open(target, 'w', encoding='utf-8')
    except OSError:
        # Title contained characters the filesystem rejects; the original
        # used a bare except: which also hid unrelated bugs.
        out = open(os.path.join('Pastebin/' + arg, 'Untitled') + '.txt',
                   'w', encoding='utf-8')
    with out:
        if paste_body is not None:
            # .string is None when the textarea has mixed children, which
            # made the original call write(None); get_text() covers that.
            out.write(paste_body.string or paste_body.get_text())
def saving_links(text_name, link_list):
    """Append every link in link_list to the file text_name, one per line."""
    print("[+] Saving accessed links")
    with open(text_name, 'a') as tracker:
        tracker.writelines(link + '\n' for link in link_list)
# Entry point: scrape the trends page once, then poll the archive page every
# 60 seconds until the user presses CTRL-C.
# NOTE(review): two module-level names here are load-bearing for the helper
# functions above — get_links() reads the global `accessed_sites`, and
# download_contents() reads the loop variable `content` as a global fallback
# title.  Do not rename either without updating those functions.
try:
    print('[+] Downloading Trends')
    # Previously-seen trend links; get_links() consults this global.
    accessed_sites = ac_sites('Trends_Accessed_Links.txt')
    pastebin_latest = requests.get('http://pastebin.com/trends')
    pastebin_soup = BeautifulSoup(pastebin_latest.text, 'html.parser')
    print('[+] Getting links')
    latest_links = get_links(pastebin_soup)
    print('\n')
    check_directory('TrendContents')
    print('[+] Downloading trends content...')
    # `content` is read inside download_contents() as a global — see NOTE above.
    for content in latest_links:
        download_contents(content, 'TrendContents')
    print('\n')
    print('[+] Saving Links')
    saving_links('Trends_Accessed_Links.txt', latest_links)
    print('\n\n')
    # Poll the public archive forever; each pass reuses the same globals
    # (`accessed_sites`, `content`) with the archive-specific tracking file.
    while True:
        accessed_sites = ac_sites('Last_Accessed_Links.txt')
        pastebin_latest = requests.get('http://pastebin.com/archive')
        pastebin_soup = BeautifulSoup(pastebin_latest.text, 'html.parser')
        latest_links = get_links(pastebin_soup)
        print('\n')
        check_directory('LastContent')
        for content in latest_links:
            download_contents(content, 'LastContent')
        print('\n')
        saving_links('Last_Accessed_Links.txt', latest_links)
        print('Let\'s return in a minute.')
        print('When you want to stop, use CTRL-C')
        time.sleep(60)
        # Clear the terminal between polls, per platform.
        if platform.system() == 'Linux' or platform.system() == 'Darwin':
            os.system('clear')
        elif platform.system() == 'Windows':
            os.system('cls')
        else:
            # Unknown platform: scroll the old output away instead.
            print('\n'*50)
except KeyboardInterrupt:
    # CTRL-C is the intended way to stop the polling loop.
    print("\nProgram finished, made by w00f.")