Issue
How can I write the scraped information to a CSV file, then close the tab, open the next thread in a new tab, and loop until all of the pages in the forum have been scraped? I'm still learning about web scraping and I'm completely stuck on this. The div that needs to be scraped has the class "post-content", but it doesn't show the correct information when I test it.
import driver as driver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common import window
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import csv
# Scrape every thread linked from the first forum page: each thread is opened
# in its own tab, the text of its "post-content" elements is written to a
# single CSV file (one row per post), and the tab is closed again.
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)
options.add_argument("start-maximized")
# The browser has to exist before a wait can be bound to it; creating the wait
# first would bind it to whatever else is named `driver` (e.g. a stray
# `import driver` module) rather than to the WebDriver instance.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 100)
driver.get("https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/1")
# Wait until the thread table has rendered at least one link; `until` returns
# the first truthy result, i.e. the non-empty list of anchor elements.
elems = wait.until(
    lambda d: d.find_elements(By.XPATH, "//table[@class='structure small-cells']//a[@href]")
)
# Read the hrefs now, while the forum page is still loaded, and keep only
# thread links. dict.fromkeys drops duplicates but preserves page order.
hrefs = [ele.get_attribute("href") for ele in elems]
links = list(dict.fromkeys(href for href in hrefs if "viewthread" in href))
print(links)
# Remember the forum tab so focus can return to it after each thread tab closes.
forum_tab = driver.current_window_handle
# create csv file; the context manager guarantees it is flushed and closed,
# and newline='' is what the csv module expects for files it writes to.
with open(r"C:\Users\jammi\OneDrive\Desktop\Navcom\test.csv", 'w', encoding='UTF8', newline='') as f:
    csvWriter = csv.writer(f)
    # to open every link into a new tab
    for link in links:
        driver.switch_to.new_window(window.WindowTypes.TAB)
        driver.get(link)
        # find_elements returns WebElement objects; the readable post body is
        # their .text, which is what belongs in the CSV.
        content = driver.find_elements(By.CLASS_NAME, "post-content")
        texts = [post.text for post in content]
        print(texts)
        # write the scraped information to the csv file, one row per post
        csvWriter.writerows([text] for text in texts)
        # Close the thread tab and go back to the forum tab before the next one.
        driver.close()
        driver.switch_to.window(forum_tab)
Solution
Here is one possible solution:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Walk every page of the forum, open each thread listed on it, and append the
# thread's posts to a per-thread CSV file named "<thread_id>_navalcommand.csv".
options = webdriver.ChromeOptions()
# disable chromedriver log message in cmd
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)
# try/finally so the browser is shut down even if a page fails to load or parse.
try:
    driver.get("https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/1")
    # get number of pages (assumes the second space-separated token of the
    # pager label is the total page count -- verify against the live page)
    num_pages = int(driver.find_element(By.CSS_SELECTOR, "span.text.rightmost").text.split(' ')[1])
    # Visit pages 1..num_pages inclusive. Navigation happens at the top of the
    # iteration so that every page that gets loaded is also scraped.
    for page in range(1, num_pages + 1):
        # Page 1 is already loaded on the first pass; later passes (and any pass
        # after thread pages were visited) need an explicit navigation.
        if page > 1:
            driver.get(f"https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/{page}")
        # find all threads on the current page
        threads = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.thread-view.thread-subject")))
        # get links to threads; read them before navigating away, because the
        # elements go stale once another page is loaded
        thread_links = [x.get_attribute('href') for x in threads]
        # open each link and get all the posts in thread
        for link in thread_links:
            driver.get(link)
            thread_content = driver.find_elements(By.CSS_SELECTOR, "div.post-content")
            if not thread_content:
                # Nothing to save; skip so no empty CSV file is created.
                continue
            # get thread id: the part between "d/" and the first "-" of the URL
            thread_id = driver.current_url.split('d/')[1].split('-')[0]
            # save received data in csv; the file is opened once per thread
            # (append mode) instead of once per post
            with open(file=f'{thread_id}_navalcommand.csv', mode='a', encoding="utf-8", newline='') as f:
                writer = csv.writer(f, lineterminator='\n')
                for post in thread_content:
                    post_content = post.text
                    if not post_content:
                        # Text-less post: fall back to the first image URL.
                        # find_elements returns [] instead of raising when
                        # the post has no image either.
                        images = post.find_elements(By.TAG_NAME, 'img')
                        post_content = images[0].get_attribute('src') if images else ''
                    writer.writerow([post_content])
finally:
    driver.quit()
The output is a list of CSV files:
32694465_navalcommand.csv
33053469_navalcommand.csv
33079839_navalcommand.csv
Each file contains the posts of a single thread.
Answered By - Lenta
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.