Downloading Event Data

 Sometimes a page with information you want to save is so long, with up to 8000 items to scroll through, that you need to automate the process. Here is an example of how a webdriver can be used for such a simple task that is difficult to do manually. The resulting list could then be parsed, filtered, and used for automated LinkedIn connection invitations or something similar.

from tinydb import TinyDB, Query
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
from selenium import webdriver
import chromedriver_binary # Adds chromedriver binary to path

from bs4 import BeautifulSoup

# Target listing: https://jaarbeurszakelijk.app.swapcard.com/event/XXX/people/
# Chrome options suitable for running in containers/CI; uncomment
# --headless to run without a visible browser window.
options = Options()
#options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("window-size=1920,1080")

driver = webdriver.Chrome(options=options)

# Open the people listing; no login is needed because access is via a magic link.
driver.get("https://jaarbeurszakelijk.app.swapcard.com/event/XXX/people/")


# Scroll to the bottom repeatedly until no more lazy-loaded items appear,
# then save the fully rendered page so the ~8000 entries can be parsed offline.
# Fixes vs. original: restored indentation (the pasted version had none and
# would not parse), added the missing `break` so the loop terminates, removed
# pdb debug leftovers, and used a context manager for the output file.
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give lazily loaded items time to render
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    scroll_position = driver.execute_script("return window.pageYOffset + window.innerHeight")
    # Stop once the viewport has reached the bottom of the document.
    if scroll_position >= scroll_height:
        break

page_content = driver.page_source
soup = BeautifulSoup(page_content, 'html.parser')
ahrefs = soup.find_all('a', href=True)  # ~8481 links on a fully loaded page
# Explicit UTF-8 so non-ASCII names don't fail on platforms with a
# different default encoding.
with open("full_page_with_names.html", "w", encoding="utf-8") as file_html:
    file_html.write(page_content)

Comments