reddit_python_scraper.py
import json
import requests
from bs4 import BeautifulSoup
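# Scrape a single Reddit post through the Smartproxy Web Scraping API,
# parse the rendered HTML with BeautifulSoup, and save the post plus its
# comments to data.json (a copy of the raw page is kept in test.html).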
url = "https://scrape.smartproxy.com/v1/tasks"
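# Task payload: the Reddit thread to fetch; "headless": "html" asks the API
# for the browser-rendered HTML and "parse": "false" skips structured parsing.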
payload = {
    "target": "universal",
    "url": "https://www.reddit.com/r/aww/comments/11xq5ew/yesterday_i_posted_how_i_was_going_to_adopt/",
    "headless": "html",
    "parse": "false"
}
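# Basic auth header; replace AUTH with your base64-encoded API credentials.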
headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "authorization": "Basic AUTH"
}
def main():
    response = requests.post(url, json=payload, headers=headers)
    # The API responds with JSON; the rendered page HTML sits in results[0]['content']
    parsed_data = json.loads(response.text)
    content = parsed_data['results'][0]['content']
    # Strip escape backslashes from the returned HTML
    stripped_content = content.replace('\\', '')
    # Keep a local copy of the rendered page for inspection
    with open("test.html", "w") as html_file:
        html_file.write(stripped_content)
    soup = BeautifulSoup(stripped_content, "html.parser")
    data = []
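    # Note: the CSS class names below are Reddit's auto-generated class names
    # as they appeared in the page markup at the time of writing; if the site
    # markup changes, these selectors will need to be updated.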
    # Select data points
    username = soup.find_all('a', class_='_2tbHP6ZydRpjI44J3syuqC _23wugcdiaj44hdfugIAlnX oQctV4n0yUb0uiHDdGnmE')
    post_timestamp = soup.find_all('span', class_='_2VF2J19pUIMSLJFky-7PEI')
    post_title = soup.find_all('h1', class_='_eYtD2XCVieq6emjKBH3m')
    comment_count = soup.find_all('span', class_='FHCV02u6Cp2zYL0fhQPsO')
    upvote_percentage = soup.find_all('div', class_='t4Hq30BDzTeJ85vREX7_M')
    subreddit_description = soup.find_all('div', class_='_1zPvgKHteTOub9dKkvrOl4')
    subreddit_name = soup.find_all('span', class_='_19bCWnxeTjqzBElWZfIlJb')
    subreddit_date = soup.find_all('span', class_='_1d4NeAxWOiy0JPz7aXRI64')
    subreddit_members = soup.find_all('div', class_='_3b9utyKN3e_kzVZ5ngPqAu')
    subreddit_members_online = soup.find_all('div', class_='_21RLQh5PvUhC6vOKoFeHUP')
    div_tags = soup.find_all('div', class_='_3tw__eCCe7j-epNCKGXUKk')
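    # Assemble the post-level record; each find_all call returns a list,
    # so the first match is taken for every field.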
    post = {
        "Username": username[0].text,
        "PostedAt": post_timestamp[0].text,
        "PostTitle": post_title[0].text,
        "CommentCount": comment_count[0].text,
        "UpvotePercentage": upvote_percentage[0].text,
        "SubredditDescription": subreddit_description[0].text,
        "SubredditName": subreddit_name[0].text,
        "SubredditCreated": subreddit_date[0].text,
        "SubredditMembers": subreddit_members[0].text,
        "SubredditMembersOnline": subreddit_members_online[0].text
    }
    data.append(post)
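    # Each div tag is one comment container; every extracted field is kept as
    # a list because a container can hold more than one match.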
    # Extract data points
    for div_tag in div_tags:
        author_tags = div_tag.find_all('a', class_='wM6scouPXXsFDSZmZPHRo DjcdNGtVXPcxG0yiFXIoZ _23wugcdiaj44hdfugIAlnX')
        author_text = [author_tag.text for author_tag in author_tags]
        comment_timestamp_tags = div_tag.find_all('a', class_='_3yx4Dn0W3Yunucf5sVJeFU')
        comment_timestamp_text = [comment_timestamp_tag.text for comment_timestamp_tag in comment_timestamp_tags]
        comment_url_tags = div_tag.find_all('a', class_='_3yx4Dn0W3Yunucf5sVJeFU', href=True)
        comment_url_text = [comment_url_tag['href'] for comment_url_tag in comment_url_tags]
        comment_text_tags = div_tag.find_all('p', class_='_1qeIAgB0cPwnLhDF9XSiJM')
        comment_text_text = [comment_text_tag.text for comment_text_tag in comment_text_tags]
        comment_upvotes_tags = div_tag.find_all('div', class_='_1rZYMD_4xY3gRcSS3p8ODO _25IkBM0rRUqWX5ZojEMAFQ _3ChHiOyYyUkpZ_Nm3ZyM2M')
        comment_upvotes_text = [comment_upvotes_tag.text for comment_upvotes_tag in comment_upvotes_tags]
        element = {
            'CommentAuthorName': author_text,
            'CommentDate': comment_timestamp_text,
            'CommentURL': comment_url_text,
            'CommentText': comment_text_text,
            'CommentUpvotes': comment_upvotes_text
        }
        data.append(element)
    # Save data to JSON
    with open('data.json', 'w') as f:
        json.dump(data, f)

if __name__ == "__main__":
    main()