-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathScraper.py
More file actions
122 lines (103 loc) · 3.67 KB
/
Scraper.py
File metadata and controls
122 lines (103 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#! /usr/bin/python
# Web Scraper for Pastebin.
# Downloads the most recent pastes and saves each one as a text file on your PC.
# Author: w00f
import platform
import os
import sys
import time
import re
import requests
from bs4 import BeautifulSoup
def ac_sites(link_list):
    """Return the contents of the accessed-links tracking file, creating it if absent.

    link_list: path of the text file that records already-downloaded paste links.
    Returns the full file contents as a string (header line included).
    """
    # First run: create the tracker with its header line.
    # (The original opened in 'w' and then tried to .read() the write-only
    # handle, relying on the resulting exception plus a recursive call and a
    # global variable to recover; this does the same thing directly.)
    if not os.path.exists(link_list):
        with open(link_list, 'w') as tracker:
            tracker.write('Accessed links\n\n')
    # Read the file back, whether it was just created or already existed.
    with open(link_list) as tracker:
        return tracker.read()
def check_directory(arg):
    """Ensure the download directory Pastebin/<arg> exists.

    Prints a status line telling the user whether the directory was
    already present or had to be created.
    """
    target = 'Pastebin/' + arg
    if os.path.isdir(target):
        print("Starting Download")
    else:
        # exist_ok closes the check-then-create race: another process (or a
        # second instance of this scraper) may create the directory between
        # the isdir() test above and this call.
        os.makedirs(target, exist_ok=True)
        print("Creating directory and starting download")
def get_links(soup):
    """Collect new paste paths from the first <table> of a pastebin listing page.

    A candidate href must be 9 characters long and match '/XXXXXXXX'
    (8 alphanumerics), and must not already appear in the module-level
    accessed_sites text.  Returns the list of new hrefs.
    """
    table = soup.find('table')
    if table is None:
        # Page layout changed or the request returned an error page;
        # the original crashed with AttributeError here.
        return []
    # Compile once instead of per-anchor; same pattern as before.
    paste_re = re.compile(r'\b[a-zA-Z0-9]{8}\b', re.IGNORECASE)
    links = []
    for anchor in table.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href made the original fail on len(None).
        if href is None or len(href) != 9:
            continue
        if href in accessed_sites:
            continue
        if paste_re.search(href):
            print('[+] New link ', href)
            links.append(href)
    return links
def download_contents(link, arg):
    """Download one paste and save it as Pastebin/<arg>/<title>.txt.

    link: paste path such as '/abc12345'.
    arg:  subdirectory name under Pastebin/ (must already exist).
    """
    print('[+] Downloading http://pastebin.com'+link+'.txt')
    site = requests.get('http://pastebin.com' + link)
    page = BeautifulSoup(site.text, 'html.parser')
    title = page.find('h1').string
    paste_body = page.find('textarea')
    if title == 'Untitled':
        # Fall back to the paste id from the parameter.  The original read
        # the module-level loop variable `content` here, silently coupling
        # this function to the caller's loop-variable name.
        title = link
    # basename() strips any path separators so a hostile title cannot
    # escape the download folder.
    target = os.path.join('Pastebin/' + arg, os.path.basename(title)) + '.txt'
    try:
        out = open(target, 'w', encoding='utf-8')
    except OSError:
        # Title contained characters the filesystem rejects; the original
        # used a bare except: which also hid unrelated bugs.
        out = open(os.path.join('Pastebin/' + arg, 'Untitled') + '.txt',
                   'w', encoding='utf-8')
    with out:
        if paste_body is not None:
            # .string is None when the textarea has mixed children, which
            # made the original call write(None); get_text() covers that.
            out.write(paste_body.string or paste_body.get_text())
def saving_links(text_name, link_list):
    """Append every link in link_list to the file text_name, one per line."""
    print("[+] Saving accessed links")
    with open(text_name, 'a') as tracker:
        tracker.writelines(link + '\n' for link in link_list)
# Entry point: scrape the trends page once, then poll the archive page every
# 60 seconds until the user presses CTRL-C.
# NOTE(review): two module-level names here are load-bearing for the helper
# functions above — get_links() reads the global `accessed_sites`, and
# download_contents() reads the loop variable `content` as a global fallback
# title.  Do not rename either without updating those functions.
try:
    print('[+] Downloading Trends')
    # Previously-seen trend links; get_links() consults this global.
    accessed_sites = ac_sites('Trends_Accessed_Links.txt')
    pastebin_latest = requests.get('http://pastebin.com/trends')
    pastebin_soup = BeautifulSoup(pastebin_latest.text, 'html.parser')
    print('[+] Getting links')
    latest_links = get_links(pastebin_soup)
    print('\n')
    check_directory('TrendContents')
    print('[+] Downloading trends content...')
    # `content` is read inside download_contents() as a global — see NOTE above.
    for content in latest_links:
        download_contents(content, 'TrendContents')
    print('\n')
    print('[+] Saving Links')
    saving_links('Trends_Accessed_Links.txt', latest_links)
    print('\n\n')
    # Poll the public archive forever; each pass reuses the same globals
    # (`accessed_sites`, `content`) with the archive-specific tracking file.
    while True:
        accessed_sites = ac_sites('Last_Accessed_Links.txt')
        pastebin_latest = requests.get('http://pastebin.com/archive')
        pastebin_soup = BeautifulSoup(pastebin_latest.text, 'html.parser')
        latest_links = get_links(pastebin_soup)
        print('\n')
        check_directory('LastContent')
        for content in latest_links:
            download_contents(content, 'LastContent')
        print('\n')
        saving_links('Last_Accessed_Links.txt', latest_links)
        print('Let\'s return in a minute.')
        print('When you want to stop, use CTRL-C')
        time.sleep(60)
        # Clear the terminal between polls, per platform.
        if platform.system() == 'Linux' or platform.system() == 'Darwin':
            os.system('clear')
        elif platform.system() == 'Windows':
            os.system('cls')
        else:
            # Unknown platform: scroll the old output away instead.
            print('\n'*50)
except KeyboardInterrupt:
    # CTRL-C is the intended way to stop the polling loop.
    print("\nProgram finished, made by w00f.")