forked from jusk9527/ArticleSpider
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
106 lines (84 loc) · 3.51 KB
/
scraper.py
File metadata and controls
106 lines (84 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from selenium import webdriver
import time
import argparse
import csv
# parser = argparse.ArgumentParser(description='Non API public FB miner')
#
# parser.add_argument('--pages', nargs='+',
# dest="pages",
# help="List the pages you want to scrape for recent posts")
#
# parser.add_argument("-d", "--depth", action="store",
# dest="depth", default=5, type=int,
# help="How many recent posts you want to gather -- in multiples of (roughly) 8.")
#
# args = parser.parse_args()
class Collector(object):
    """Collector of recent FaceBook posts.

    Note: We bypass the FaceBook-Graph-API by using a
    selenium FireFox instance!
    This is against the FB guide lines and thus not allowed.
    USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTUALLY RUN IT.
    """

    def __init__(self, pages=None, corpus_file="posts.csv", depth=5, delay=2,
                 driver_path="/Users/xiao/PycharmProjects/ArticleSpider/chromdriver2.33/chromedriver"):
        """Start a browser instance and create the CSV output file with its header row.

        Args:
            pages: list of FB page names to scrape; defaults to ["oxfess"].
                (Uses None as the default to avoid a shared mutable default list.)
            corpus_file: path of the CSV file posts are written to.
            depth: how many times to scroll each page down; each scroll loads
                roughly 8 more posts.
            delay: seconds to wait after each scroll for new content to load.
            driver_path: filesystem path to the chromedriver executable.
        """
        super(Collector, self).__init__()
        # None sentinel instead of a mutable default argument.
        self.pages = ["oxfess"] if pages is None else pages
        self.dump = corpus_file
        # +1 so that `depth` scrolls happen after the initial page load.
        self.depth = depth + 1
        self.delay = delay
        # browser instance
        # self.browser = webdriver.Firefox()
        self.browser = webdriver.Chrome(executable_path=driver_path)
        # creating CSV header
        with open(self.dump, "w", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            writer.writerow(["Source", "utime", "Text"])

    def strip(self, string):
        """Helping function to remove all non alphanumeric characters.

        Drops every whitespace-separated word containing '#' (hashtags),
        then keeps only alphanumerics, spaces, '.' and ','.
        """
        words = [word for word in string.split() if "#" not in word]
        string = " ".join(words)
        # "".join is linear; the previous `clean += c` loop was quadratic.
        return "".join(c for c in string if str.isalnum(c) or c in " .,")

    def collect_page(self, page):
        """Scrape one FB page: scroll it open, then append its posts to the CSV."""
        # navigate to page
        self.browser.get('https://www.facebook.com/' + page + '/')
        # Scroll down depth-times and wait delay seconds to load
        # between scrolls
        for scroll in range(self.depth):
            # Scroll down to bottom
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(self.delay)
        # Once the full page is loaded, we can start scraping
        with open(self.dump, "a+", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            # NOTE(review): find_elements_by_* is removed in Selenium 4;
            # migrate to find_elements(By.CLASS_NAME, ...) when upgrading.
            posts = self.browser.find_elements_by_class_name(
                "userContentWrapper")
            for post in posts:
                # Creating first CSV row entry with the page name (eg. "DonaldTrump")
                analysis = [page]
                # Creating utime entry.
                time_element = post.find_element_by_css_selector(
                    "abbr")
                utime = time_element.get_attribute("data-utime")
                analysis.append(utime)
                # Creating post text entry
                text = ""
                text_elements = post.find_elements_by_css_selector(
                    "p")
                for p in text_elements:
                    text += self.strip(p.text)
                analysis.append(text)
                # Write row to csv
                writer.writerow(analysis)

    def collect(self):
        """Scrape every configured page in turn."""
        for page in self.pages:
            self.collect_page(page)
if __name__ == "__main__":
    # Guarded entry point: importing this module must not launch a browser.
    # Example run: one scroll's worth of recent posts from a single page.
    C = Collector(pages=["DonaldTrump"], depth=1)
    C.collect()