forked from jusk9527/ArticleSpider
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
106 lines (84 loc) · 3.51 KB
/
scraper.py
File metadata and controls
106 lines (84 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from selenium import webdriver
import time
import argparse
import csv
# parser = argparse.ArgumentParser(description='Non API public FB miner')
#
# parser.add_argument('--pages', nargs='+',
# dest="pages",
# help="List the pages you want to scrape for recent posts")
#
# parser.add_argument("-d", "--depth", action="store",
# dest="depth", default=5, type=int,
# help="How many recent posts you want to gather -- in multiples of (roughly) 8.")
#
# args = parser.parse_args()
class Collector(object):
    """Collector of recent FaceBook posts.

    Note: We bypass the FaceBook-Graph-API by using a
    selenium FireFox instance!
    This is against the FB guide lines and thus not allowed.
    USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTUALLY RUN IT.
    """

    def __init__(self, pages=None, corpus_file="posts.csv", depth=5, delay=2,
                 driver_path="/Users/xiao/PycharmProjects/ArticleSpider/chromdriver2.33/chromedriver"):
        """Start a browser instance and create the CSV output file with its header row.

        Args:
            pages: list of FB page names to scrape; defaults to ["oxfess"].
                (Uses None as the default to avoid a shared mutable default list.)
            corpus_file: path of the CSV file posts are written to.
            depth: how many times to scroll each page down; each scroll loads
                roughly 8 more posts.
            delay: seconds to wait after each scroll for new content to load.
            driver_path: filesystem path to the chromedriver executable.
        """
        super(Collector, self).__init__()
        # None sentinel instead of a mutable default argument.
        self.pages = ["oxfess"] if pages is None else pages
        self.dump = corpus_file
        # +1 so that `depth` scrolls happen after the initial page load.
        self.depth = depth + 1
        self.delay = delay
        # browser instance
        # self.browser = webdriver.Firefox()
        self.browser = webdriver.Chrome(executable_path=driver_path)
        # creating CSV header
        with open(self.dump, "w", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            writer.writerow(["Source", "utime", "Text"])

    def strip(self, string):
        """Helping function to remove all non alphanumeric characters.

        Drops every whitespace-separated word containing '#' (hashtags),
        then keeps only alphanumerics, spaces, '.' and ','.
        """
        words = [word for word in string.split() if "#" not in word]
        string = " ".join(words)
        # "".join is linear; the previous `clean += c` loop was quadratic.
        return "".join(c for c in string if str.isalnum(c) or c in " .,")

    def collect_page(self, page):
        """Scrape one FB page: scroll it open, then append its posts to the CSV."""
        # navigate to page
        self.browser.get('https://www.facebook.com/' + page + '/')
        # Scroll down depth-times and wait delay seconds to load
        # between scrolls
        for scroll in range(self.depth):
            # Scroll down to bottom
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(self.delay)
        # Once the full page is loaded, we can start scraping
        with open(self.dump, "a+", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            # NOTE(review): find_elements_by_* is removed in Selenium 4;
            # migrate to find_elements(By.CLASS_NAME, ...) when upgrading.
            posts = self.browser.find_elements_by_class_name(
                "userContentWrapper")
            for post in posts:
                # Creating first CSV row entry with the page name (eg. "DonaldTrump")
                analysis = [page]
                # Creating utime entry.
                time_element = post.find_element_by_css_selector(
                    "abbr")
                utime = time_element.get_attribute("data-utime")
                analysis.append(utime)
                # Creating post text entry
                text = ""
                text_elements = post.find_elements_by_css_selector(
                    "p")
                for p in text_elements:
                    text += self.strip(p.text)
                analysis.append(text)
                # Write row to csv
                writer.writerow(analysis)

    def collect(self):
        """Scrape every configured page in turn."""
        for page in self.pages:
            self.collect_page(page)
if __name__ == "__main__":
    # Guarded entry point: importing this module must not launch a browser.
    # Example run: one scroll's worth of recent posts from a single page.
    C = Collector(pages=["DonaldTrump"], depth=1)
    C.collect()