# extract.py
# Forked from aadityakushwaha/DWCrawler.
import os

import mysql.connector
import requests
from bs4 import BeautifulSoup
# Scrape every not-yet-scraped URL listed in the `onion_urls` table and store
# the page title, meta keywords, meta description, body text and image URLs
# back on the same row.
#
# Row states written by this script (column `scraped`):
#    1  -> page fetched and data stored
#   -1  -> page fetched, but the first attempt to store the data failed
#    0  -> untouched (also left as-is when fetching/parsing itself fails)

# Connect to MySQL database.
# Every setting can be overridden through the environment variables named
# below; the fallbacks are the values this script has always used, so running
# it without any environment set up behaves exactly as before.
# NOTE(security): a root password is committed in this file. It should be
# rotated and supplied only through CRAWLER_DB_PASSWORD; the literal fallback
# is kept solely for backward compatibility.
mydb = mysql.connector.connect(
    host=os.environ.get("CRAWLER_DB_HOST", "34.220.243.94"),
    port=os.environ.get("CRAWLER_DB_PORT", "3306"),
    user=os.environ.get("CRAWLER_DB_USER", "root"),
    password=os.environ.get("CRAWLER_DB_PASSWORD", "Girlactor@77"),
    database=os.environ.get("CRAWLER_DB_NAME", "Crawler"),
)
mycursor = mydb.cursor()

# All requests go through a local SOCKS proxy on port 9050 (presumably Tor,
# given the .onion targets). The `socks5h` scheme asks the proxy to resolve
# host names instead of resolving them locally.
_PROXIES = {
    'http': 'socks5h://localhost:9050',
    'https': 'socks5h://localhost:9050',
}
_REQUEST_TIMEOUT = 10  # seconds, passed to requests.get

# Stores the scraped data and flags the row as done; the `scraped = 0` guard
# keeps an already-processed row from being overwritten.
_SAVE_SQL = "UPDATE onion_urls SET url = %s, title = %s, keywords = %s, description = %s, content = %s, image_urls = %s, scraped = 1 WHERE id = %s and scraped = 0"
# Same data, but flags the row as failed (-1) unless it is already done.
_FAIL_SQL = "UPDATE onion_urls SET url = %s, title = %s, keywords = %s, description = %s, content = %s, image_urls = %s, scraped = -1 WHERE id = %s and scraped <> 1"


def _meta_content(meta_tags, name):
    """Return the concatenated ``content`` of all meta tags named *name*.

    A tag that lacks a ``content`` attribute contributes an empty string
    instead of ``None``, so one malformed tag no longer aborts the page.
    """
    return "".join(
        tag.get('content') or ""
        for tag in meta_tags
        if tag.get('name') == name
    )


def _extract_fields(soup):
    """Pull the stored fields out of a parsed page.

    Returns a ``(title, keywords, description, content, image_urls)`` tuple
    of whitespace-stripped strings. ``image_urls`` holds one ``src`` value
    per line; any field that is absent from the page is an empty string.
    """
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ""

    meta_tags = soup.find_all('meta')
    keywords = _meta_content(meta_tags, 'keywords')
    description = _meta_content(meta_tags, 'description')

    content = "".join(tag.text.strip() for tag in soup.find_all('body'))

    # Only <img> tags with a non-empty src are recorded.
    image_urls = "\n".join(
        tag.get('src') for tag in soup.find_all('img') if tag.get('src')
    )

    return (
        title.strip(),
        keywords.strip(),
        description.strip(),
        content.strip(),
        image_urls.strip(),
    )


try:
    # Get list of onion URLs from database that haven't been scraped yet
    mycursor.execute("SELECT id, url FROM onion_urls WHERE scraped = 0")
    onion_urls = mycursor.fetchall()

    # Loop through onion URLs and scrape meta content, image URLs, and other useful data
    for url_id, url in onion_urls:
        try:
            # Make request to onion URL and parse HTML with BeautifulSoup
            response = requests.get(url, proxies=_PROXIES, timeout=_REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.content, 'html.parser')

            title, keywords, description, content, image_urls = _extract_fields(soup)
            val = (url.strip(), title, keywords, description, content, image_urls, url_id)

            try:
                # Update record in database and mark as scraped
                mycursor.execute(_SAVE_SQL, val)
                mydb.commit()
                print(f"Successfully scraped and updated data for {url}")
            except Exception as e:
                print(f"Error saaving data to database: {e}")
                # Discard whatever the failed statement left pending before
                # trying to flag the row as failed.
                mydb.rollback()
                mycursor.execute(_FAIL_SQL, val)
                mydb.commit()
        except Exception as e:
            # Best effort: report the problem and move on to the next URL.
            # The row keeps scraped = 0, so it is picked up again next run.
            print(f"Some error occured: {e}")
finally:
    # Release the cursor and the connection even if the run is interrupted.
    mycursor.close()
    mydb.close()