-
Notifications
You must be signed in to change notification settings - Fork 1
/
PnumExtrator.py
63 lines (47 loc) · 1.79 KB
/
PnumExtrator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from bs4 import BeautifulSoup
from urllib.request import urlopen
from openpyxl import load_workbook
import re
filepath = "test.xlsx"  # Workbook that holds the URLs; results are written back into it.

URL_COLUMN = 1       # Column the URLs are read from.
PHONE_COLUMN = 12    # Column the extracted phone numbers are written to.
FIRST_URL_ROW = 2    # First row that is read; row 1 is never inspected.
REQUEST_TIMEOUT = 30  # Seconds before a request is abandoned, so one dead host cannot hang the run.

# Ten-digit numbers such as "(555) 123-4567", "555.123.4567" or "5551234567".
PHONE_PATTERN = re.compile(r'\(?[0-9]{3}\)?[ .-]?[0-9]{3}[ .-]?[0-9]{4}')


def normalize_url(raw):
    """Return the cell value *raw* as a URL string that carries a scheme.

    The value is converted to ``str`` and stripped of surrounding whitespace.
    Values already starting with ``http://`` or ``https://`` (any letter case)
    are returned as they are; anything else gets ``http://`` prepended.
    """
    url = str(raw).strip()
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'http://' + url


def contact_url(url):
    """Return the ``/contact`` page address under *url*.

    Trailing slashes are dropped first so the result never contains
    ``//contact``.
    """
    return url.rstrip('/') + '/contact'


def find_phones(text):
    """Return every phone-number match in *text*, in order of appearance."""
    return PHONE_PATTERN.findall(text)


def fetch_phones(url):
    """Download *url* and return the phone numbers found in its ``<p>`` elements.

    Any error raised while downloading or parsing propagates to the caller.
    """
    # The response is closed as soon as the document has been parsed.
    with urlopen(url, timeout=REQUEST_TIMEOUT) as page:
        soup = BeautifulSoup(page, 'html.parser')
    # Paragraphs are newline-separated so digits of adjacent paragraphs cannot
    # run together into a false match.
    text = '\n'.join(paragraph.text for paragraph in soup.select("p"))
    return find_phones(text)


def lookup_phones(url):
    """Return the phone numbers for the site at *url*, or ``[]`` if none are found.

    The page at *url* is searched first; if it yields nothing, the site's
    ``/contact`` page is searched instead. A failing request or parse is
    reported on stdout and treated as "no numbers" so the remaining URLs are
    still processed.
    """
    try:
        phones = fetch_phones(url)
        if not phones:
            # Nothing on the index page: fall back to the contact page.
            url = contact_url(url)
            print(url)
            phones = fetch_phones(url)
    except Exception as exc:
        # Best-effort scraping: skip this site. KeyboardInterrupt and
        # SystemExit are not caught, so the run can still be stopped.
        print('Skipped', url, '-', exc)
        return []
    return phones


def main():
    """Fill the phone-number column of the workbook at ``filepath``.

    URLs are read from column ``URL_COLUMN`` starting at ``FIRST_URL_ROW`` and
    processing stops at the first empty cell. For every URL that yields phone
    numbers, the de-duplicated numbers are written, comma-separated, to
    ``PHONE_COLUMN`` of the same row. The workbook is saved even when the run
    is interrupted, so results gathered so far are kept.
    """
    wb = load_workbook(filepath)
    sheet = wb.active
    try:
        row = FIRST_URL_ROW
        while True:
            raw = sheet.cell(row, URL_COLUMN).value
            if raw is None:  # First empty cell marks the end of the list.
                break
            url = normalize_url(raw)
            print(url)
            phones = lookup_phones(url)
            if phones:
                # dict.fromkeys removes duplicates while keeping first-seen order.
                joined = ', '.join(dict.fromkeys(phones))
                sheet.cell(row, PHONE_COLUMN).value = joined
                print(joined)
            row += 1
    finally:
        wb.save(filepath)


if __name__ == "__main__":
    main()