[SOLVED] How to handle pagination if only javascript can be loaded in BeautifulSoup Python

Issue

This Content is from Stack Overflow. Question asked by theycallmepix

I've been stuck on this for hours.
The entire code has been uploaded below.
The URL "https://www.bostonpublicschools.org/Page/628" is a single page. However, it contains 5 sub-pages that are loaded by clicking the page icons at the bottom. Clicking triggers a JavaScript request that cannot be opened in a new tab (attempting to do so results in the page being blocked), and the new content loads within the same page.

I’ve so far looked at more than 100 solutions and tried many of them and none of them worked for me. Also, I do believe that this can be solved by using selenium as it can interact with the UI.

Is there any way I can get the job done within BeautifulSoup?

from bs4 import BeautifulSoup as bs
import requests
import re

# Module-level flag updated inside main() to record whether the most recently
# checked email address passed validation.
valid = False


def URL_Parser(url):
    """Fetch *url* with a mobile user-agent and return the requests Response.

    Fix: ``requests.get`` has no default timeout, so a stalled server could
    hang the scraper forever; a 30-second timeout is now supplied.
    """
    # Desktop user-agent (unused alternative; swap into `headers` if the
    # mobile layout stops serving the markup the scraper expects).
    DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    # Mobile user-agent
    MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

    headers = {"user-agent": MOBILE_USER_AGENT}
    # timeout= prevents an indefinite block on an unresponsive host.
    resp = requests.get(url, headers=headers, timeout=30)
    return resp


def validate_mail(e_mail):
    """Return True if *e_mail* matches a basic user@domain.tld pattern.

    Fix: the original pattern used an unescaped ``.`` before the domain
    suffix, which matches ANY character, so strings like ``a@bxcom``
    incorrectly validated. The dot is now escaped (``\\.``) to require a
    literal separator between the domain and its suffix.
    """
    return bool(re.search(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$", e_mail))


def main():
    """Scrape school names, email addresses and links from the BPS directory.

    Fetches the directory page, collects per-school page links, then visits
    each school page and probes several <p> elements for a valid email.
    Results are printed as three parallel lists.
    """
    global valid  # module-level validity flag, reused across schools
    URL = "https://www.bostonpublicschools.org/Page/628"
    response = URL_Parser(URL)

    if response.status_code == 200:
        soup = bs(response.content, "html.parser")

        # Columnar result store: three parallel lists indexed by school.
        school_database = {"School Name": [],
                           "School Email ID": [],
                           "School Link": []
                           }

        links = []

        # Keep only anchors pointing back into the site's /Page/ tree that
        # carry an 'aria-invalid' attribute.
        # NOTE(review): 'aria-invalid' looks like an accidental marker for
        # the intended school links — confirm it actually selects them.
        for link in soup.findAll('a', attrs={'href': re.compile("^https://www.bostonpublicschools.org/Page/")}):
            if link.get('aria-invalid'):
                links.append(link.get('href'))

        for link in links:
            # Fetch and parse the individual school page.
            school = bs(URL_Parser(link).content, 'html.parser')

            # School name: first <h1> inside the flexpage widget's header.
            school_name = school.find('div', {'class': "ui-widget app flexpage"}).find(class_="ui-widget-header").select('h1')[0].text.strip()
            try:
                # The email's paragraph position varies per page: probe the
                # 3rd, then 2nd, then 4th <p> until one validates.
                school_email = school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description").select('p')[2].text.strip()
                valid = validate_mail(school_email)
                if not valid:
                    school_email = school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description").select('p')[1].text.strip()
                    valid = validate_mail(school_email)
                    if not valid:
                        school_email = school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description").select('p')[3].text.strip()
                        valid = validate_mail(school_email)

            except IndexError:
                # Fewer <p> elements than expected — skip this school.
                print(">>> Email is not in expected place!")
            else:
                # Only record the school when no IndexError occurred.
                school_database["School Name"].append(school_name)
                school_database["School Email ID"].append(school_email)
                school_database["School Link"].append(link)

        print(school_database["School Link"])
        print(school_database["School Name"])
        print(school_database["School Email ID"])


# Standard script-entry guard. The original `if '__init__' == main(): main()`
# was broken: it compared the string '__init__' to main()'s return value
# (None), so main() ran once inside the condition and the guard never matched.
if __name__ == '__main__':
    main()



Solution

Try:

import re
import requests
from bs4 import BeautifulSoup

url = "https://www.bostonpublicschools.org/Page/628"

# Fetch the directory page once; the pagination parameters the site's
# JavaScript would normally supply are all embedded in this HTML.
html_doc = requests.get(url).text
soup = BeautifulSoup(html_doc, "html.parser")

# The inline window.open(...) call carries both module-instance IDs.
popup_target = re.search(r"window\.open\((.*)\);", html_doc).group(1)
ModuleInstanceID = re.search(r"ModuleInstanceID=(\d+)", popup_target).group(1)
PageModuleInstanceID = re.search(r"PageModuleInstanceID=(\d+)", popup_target).group(1)
# The directory type sits in a hidden input whose id ends in "displaytype".
DirectoryType = soup.select_one("[id$=displaytype]")["value"]
# The last pagination <li> gives the total page count.
total_pages = int(soup.select(".ui-pagination-list li")[-1].text)

# Endpoint the pagination JavaScript posts to for each page of results.
url = "https://www.bostonpublicschools.org/site/UserControls/Minibase/MinibaseListWrapper.aspx"

params = {
    "ModuleInstanceID": ModuleInstanceID,
    "PageModuleInstanceID": PageModuleInstanceID,
    "FilterFields": "",
    "DirectoryType": DirectoryType,
    "PageIndex": "",
}

# Request every page directly and print the school names it lists.
for page_number in range(1, total_pages + 1):
    params["PageIndex"] = page_number
    page_soup = BeautifulSoup(
        requests.get(url, params=params).content, "html.parser"
    )
    for anchor in page_soup.select(".sw-flex-item-group a"):
        print(anchor.text)

Prints:


...

UP Academy Holland
Warren/Prescott K-8
West Zone Early Learning Center
Winship, F. Lyman Elementary
Winthrop, John Elementary
Young Achievers Science & Math K-8


This question was asked on Stack Overflow by theycallmepix and answered by Andrej Kesely. It is licensed under the terms of CC BY-SA 2.5, CC BY-SA 3.0, or CC BY-SA 4.0.

Other people found this article helpful. What about you?