Issue
This Content is from Stack Overflow. Question asked by theycallmepix
I have been stuck on this for hours.
The entire code has been uploaded below.
The URL “https://www.bostonpublicschools.org/Page/628” is a single page, but its listing is split into 5 pages that are loaded by clicking the page icons at the bottom. Clicking one runs JavaScript that cannot be opened in a new tab (trying to do so results in the page being blocked); the new results load into the same page instead.
So far I’ve looked at more than 100 solutions and tried many of them, but none worked for me. I believe this could be solved with Selenium,
since it can interact with the UI.
Is there any way I can get the job done within BeautifulSoup?
from bs4 import BeautifulSoup as bs
import requests
import re
# Result of the most recent validate_mail() check; main() declares this name
# global and overwrites it while probing each school page.
valid = False
def URL_Parser(url, timeout=30):
    """Fetch *url* with a mobile browser user-agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` does not time out on its own, so a
            stalled connection would hang the whole scrape.

    Returns:
        The ``requests.Response`` of the GET request. The status code is not
        checked here; callers inspect ``status_code`` themselves.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    # Mobile user-agent
    MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
    headers = {"user-agent": MOBILE_USER_AGENT}
    return requests.get(url, headers=headers, timeout=timeout)
# The dot between the domain and the TLD is escaped: unescaped, it matches any
# character, so strings such as "user@examplecom" were accepted.
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")


def validate_mail(e_mail):
    """Return True when *e_mail* looks like ``local@domain.tld``.

    The whole string must match (``fullmatch``), so surrounding text or a
    trailing newline makes the address invalid.

    Args:
        e_mail: Candidate address as a string.

    Returns:
        bool: True if the string has the expected e-mail shape, else False.
    """
    return _EMAIL_PATTERN.fullmatch(e_mail) is not None
def _school_name(school):
    """Return the text of the first <h1> in the page's widget header, or None if absent."""
    widget = school.find('div', {'class': "ui-widget app flexpage"})
    header = widget.find(class_="ui-widget-header") if widget is not None else None
    headings = header.select('h1') if header is not None else []
    return headings[0].text.strip() if headings else None


def _first_valid_email(school):
    """Return the first paragraph text that passes validate_mail(), or None.

    The address is looked up among the <p> elements of the
    "ui-article-description" element inside the "ui-column-one-half region"
    div. Paragraph indices 2, 1 and 3 are probed in that order.
    """
    region = school.find('div', {'class': "ui-column-one-half region"})
    description = (
        region.find(class_="ui-article-description") if region is not None else None
    )
    if description is None:
        return None
    # Look the paragraphs up once instead of re-running the search per probe.
    paragraphs = description.select('p')
    for index in (2, 1, 3):
        if index >= len(paragraphs):
            continue
        candidate = paragraphs[index].text.strip()
        if validate_mail(candidate):
            return candidate
    return None


def main():
    """Scrape the school directory page and print links, names and e-mails.

    Fetches the directory page, follows every matching school link that
    carries an ``aria-invalid`` attribute, and records a school only when both
    its name and a valid e-mail address were found. Pages that lack the
    expected markup are reported and skipped instead of aborting the run or
    storing non-e-mail text. Nothing is printed when the directory page does
    not answer with HTTP 200.

    Side effects:
        Updates the module-level ``valid`` flag with the outcome of the most
        recent e-mail lookup and prints the three collected lists.
    """
    global valid
    URL = "https://www.bostonpublicschools.org/Page/628"
    response = URL_Parser(URL)
    if response.status_code != 200:
        return

    soup = bs(response.content, "html.parser")
    school_database = {
        "School Name": [],
        "School Email ID": [],
        "School Link": [],
    }

    page_link = re.compile("^https://www.bostonpublicschools.org/Page/")
    links = [
        link.get('href')
        for link in soup.find_all('a', attrs={'href': page_link})
        if link.get('aria-invalid')
    ]

    for link in links:
        school = bs(URL_Parser(link).content, 'html.parser')

        school_name = _school_name(school)
        if school_name is None:
            print(">>> School name is not in expected place!")
            continue

        school_email = _first_valid_email(school)
        valid = school_email is not None
        if not valid:
            print(">>> Email is not in expected place!")
            continue

        school_database["School Name"].append(school_name)
        school_database["School Email ID"].append(school_email)
        school_database["School Link"].append(link)

    print(school_database["School Link"])
    print(school_database["School Name"])
    print(school_database["School Email ID"])
# Run the scraper only when this file is executed as a script. The previous
# condition called main() while evaluating the comparison, so it also ran on
# import, and the guarded call itself could never execute.
if __name__ == "__main__":
    main()
Solution
Try:
import re
import requests
from bs4 import BeautifulSoup
# Page that hosts the paginated school directory.
page_url = "https://www.bostonpublicschools.org/Page/628"

# The page's own HTML carries every parameter the listing endpoint is given.
html_doc = requests.get(page_url).text
landing = BeautifulSoup(html_doc, "html.parser")

# Both instance IDs are read from the argument of the window.open(...) call.
popup_target = re.search(r"window\.open\((.*)\);", html_doc).group(1)
module_id = re.search(r"ModuleInstanceID=(\d+)", popup_target).group(1)
page_module_id = re.search(r"PageModuleInstanceID=(\d+)", popup_target).group(1)
directory_type = landing.select_one("[id$=displaytype]")["value"]

# The text of the last pagination item is used as the total page count.
pagination_items = landing.select(".ui-pagination-list li")
total_pages = int(pagination_items[-1].text)

endpoint = "https://www.bostonpublicschools.org/site/UserControls/Minibase/MinibaseListWrapper.aspx"

# Request each listing page in turn and print the text of every entry link.
for page_index in range(1, total_pages + 1):
    query = {
        "ModuleInstanceID": module_id,
        "PageModuleInstanceID": page_module_id,
        "FilterFields": "",
        "DirectoryType": directory_type,
        "PageIndex": page_index,
    }
    listing = requests.get(endpoint, params=query)
    page_soup = BeautifulSoup(listing.content, "html.parser")
    for anchor in page_soup.select(".sw-flex-item-group a"):
        print(anchor.text)
Prints:
...
UP Academy Holland
Warren/Prescott K-8
West Zone Early Learning Center
Winship, F. Lyman Elementary
Winthrop, John Elementary
Young Achievers Science & Math K-8
This question was asked on Stack Overflow by theycallmepix and answered by Andrej Kesely. It is licensed under the terms of CC BY-SA 2.5, CC BY-SA 3.0, and CC BY-SA 4.0.