Issue
This Content is from Stack Overflow. Question asked by Martin Vardanyan
I have more than 4000 URLs and need to send a GET request to each of them. This is my script, and the code gets stuck after 140 URLs. How can I solve this?
I think the problem may be with the requests library, but I have no idea why.
def _get_with_retry(url, headers, attempts=2):
    """Fetch ``url`` with ``requests.get``, retrying when it raises ``OSError``.

    Args:
        url: Absolute URL to request.
        headers: Mapping of HTTP headers sent with every attempt.
        attempts: Maximum number of tries before giving up.

    Returns:
        The response object of the first successful attempt, or ``None`` if
        every attempt raised ``OSError``.
    """
    for _ in range(attempts):
        try:
            # NOTE: per the Requests documentation, `timeout` is not a limit
            # on the whole download; it only bounds the wait for the
            # connection and for each chunk of data, so a server that keeps
            # trickling bytes can still hold this call for a long time.
            return requests.get(url, headers=headers, verify=False, timeout=5)
        except OSError:
            continue
    return None


def main():
    """Collect ``check_text`` matches for every domain from ``get_data()``.

    For each domain the home page is fetched, a contact-page link is looked
    up with ``get_links`` and fetched as well, and ``check_text`` is run on
    both page bodies. If any match does not contain the domain string and a
    contact link exists, the contact page is additionally loaded through
    dryscrape and its body is scanned too.

    A domain whose home page or contact page cannot be fetched (after one
    retry) is recorded with an empty list.

    Returns:
        dict mapping each domain to the list of matches found for it.
        (``check_text`` is assumed to return a mutable set, since its results
        are merged with ``|=`` -- TODO confirm against its definition.)
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import urljoin

    driver = dryscrape.Session()
    contacts = ['contact', 'Contact']
    headers = {
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    domains = get_data()
    data = {}
    for domain in tqdm(domains):
        base_url = 'https://www.' + domain
        page = _get_with_retry(base_url, headers)
        if page is None:
            data[domain] = []
            continue

        true_link = get_links(page, contacts)
        contact_page = None
        if true_link:
            # Resolve the href against the site root. This handles relative
            # paths with or without a leading slash and leaves absolute URLs
            # (including links to other hosts) untouched.
            true_link = urljoin(base_url, true_link)
            contact_page = _get_with_retry(true_link, headers)
            if contact_page is None:
                data[domain] = []
                continue

        found = check_text(page.text)
        if contact_page is not None:
            found |= check_text(contact_page.text)

        # Only pay for a dryscrape render when a contact page exists and at
        # least one match so far does not mention the domain itself.
        if true_link and any(domain not in item for item in found):
            try:
                rendered = dryscrape_search(driver, true_link)
            except Exception:
                # Best effort: a failed render contributes no extra matches.
                rendered = ''
            found |= check_text(rendered)

        data[domain] = list(found)
    return data
And these are some of the functions that I use in the main function:
#
def get_links(page, contacts):
    """Return the first anchor href on ``page`` that contains a contact keyword.

    Args:
        page: Response-like object whose ``content`` attribute holds the raw
            HTML passed to ``html.fromstring``.
        contacts: Iterable of substrings; an href matches when it contains
            any one of them (the check is case-sensitive).

    Returns:
        The matching href string exactly as it appears in the document, or
        ``False`` when no ``<a href>`` contains any of the keywords.
    """
    # NOTE(review): `html` is presumably `lxml.html` -- confirm against the
    # file's imports.
    webpage = html.fromstring(page.content)
    # Iterate the hrefs in the order the XPath query yields them instead of
    # through a set, so the link that is picked does not depend on set
    # iteration order.
    for link in webpage.xpath('//a/@href'):
        if any(contact in link for contact in contacts):
            return link
    return False
#
def dryscrape_search(driver, true_link):
    """Load ``true_link`` in ``driver`` and return the resulting page body.

    Args:
        driver: Object exposing ``visit(url)`` and ``body()``.
        true_link: URL handed to ``driver.visit``.

    Returns:
        Whatever ``driver.body()`` returns after the visit.
    """
    driver.visit(true_link)
    rendered_body = driver.body()
    return rendered_body
Solution
This question has not been answered yet; be the first to answer it in the comments. Later, the confirmed answer will be published as the solution.
This question and answer are collected from Stack Overflow and tested by the JTuto community, and are licensed under the terms of CC BY-SA 2.5, CC BY-SA 3.0, and CC BY-SA 4.0.