mirror of
https://git.zavage.net/Zavage-Software/wikicrawl.git
synced 2025-01-18 03:18:44 -07:00
cli and multi language support
This commit is contained in:
parent
309f148700
commit
64093c58a2
@ -1,7 +1,5 @@
|
||||
import pickle
|
||||
import selenium
|
||||
import selenium.webdriver
|
||||
import time
|
||||
import logging
|
||||
|
||||
settings = {}
|
||||
@ -10,7 +8,16 @@ def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
|
||||
def create_webdriver(driver='chrome'):
    """Create a webdriver for the requested browser.

    Args:
        driver: Browser name; 'chrome' (the default) or 'firefox'.

    Returns:
        Whatever the matching ``create_webdriver_<browser>()`` factory
        returns.

    Raises:
        ValueError: If ``driver`` is not one of the supported names.
    """
    if driver == 'chrome':
        return create_webdriver_chrome()
    if driver == 'firefox':
        return create_webdriver_firefox()
    # An unknown name used to fall off the end and return None, which only
    # surfaced later as a confusing failure in the caller. Fail here instead.
    raise ValueError(
        "Unsupported webdriver browser %r; expected 'chrome' or 'firefox'"
        % (driver,)
    )
|
||||
|
||||
def create_webdriver_firefox():
    """Create a Firefox webdriver (placeholder, not implemented yet).

    Raises:
        NotImplementedError: Always. The stub used to return None
            silently, so selecting 'firefox' produced a missing driver
            instead of a clear error.
    """
    raise NotImplementedError('Firefox webdriver support is not implemented yet')
|
||||
|
||||
def create_webdriver_chrome():
|
||||
opt = selenium.webdriver.chrome.options.Options()
|
||||
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
|
||||
opt.add_argument('--kiosk-printing')
|
||||
|
@ -31,6 +31,7 @@ def main():
|
||||
class InteractiveInterface:
|
||||
def __init__(self):
|
||||
self.model = model.Model()
|
||||
x = self.model.webdriver # Request the browser open immediately
|
||||
|
||||
def run(self, args, main=True):
|
||||
try:
|
||||
|
@ -25,7 +25,9 @@ class Model:
|
||||
@property
def webdriver(self):
    """Return the cached webdriver, creating it on first access.

    On first access the browser named by ``settings.WEBDRIVER_BROWSER`` is
    created through ``browser.create_webdriver`` and sent to the landing
    page. Later accesses return the same cached object.

    Raises:
        RuntimeError: If the factory did not return a driver.
    """
    if not self._webdriver:
        driver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
        if driver is None:
            # Without this guard a missing driver would leave the cache
            # empty and every later access would retry the creation.
            raise RuntimeError(
                'No webdriver could be created for browser %r'
                % (settings.WEBDRIVER_BROWSER,)
            )
        self._webdriver = driver
        # Pass the new driver on directly. Reading self.webdriver here
        # re-enters this property, which recursed without bound whenever
        # the cached value was still falsy.
        page_api = pages.LandingPage(driver)
        page_api.goto_landing_page()
    return self._webdriver
|
||||
|
||||
def do_random_page(self):
|
||||
@ -39,7 +41,6 @@ class Model:
|
||||
page_api.goto_random_article()
|
||||
|
||||
# Article page
|
||||
|
||||
pages_visited = []
|
||||
while True:
|
||||
page_api = pages.ArticlePage(self.webdriver)
|
||||
|
@ -2,6 +2,7 @@ import logging
|
||||
import re
|
||||
import selenium
|
||||
import selenium.webdriver
|
||||
import time
|
||||
|
||||
settings = {}
|
||||
|
||||
@ -22,6 +23,7 @@ class PageRootObject:
|
||||
|
||||
def click(self, el):
    """Highlight ``el`` in red, wait ``settings.PAGE_DELAY``, then click it."""
    self.highlight(el, 'red')
    pause = settings.PAGE_DELAY
    time.sleep(pause)
    el.click()
|
||||
|
||||
def highlight(self, el, color):
|
||||
@ -47,7 +49,7 @@ class MainPage(PageRootObject):
|
||||
super().__init__(driver)
|
||||
|
||||
def goto_random_article(self):
    """Click the navigation link that leads to a random article.

    The link is found through the list item whose id contains
    ``n-randompage`` rather than through its visible text, so the lookup
    does not depend on the language the page is displayed in.
    """
    # Use the generic find_element(by, value) call: the
    # find_element_by_xpath helper was removed in Selenium 4.3, while this
    # form is accepted by both older and newer releases. 'xpath' is the
    # value of selenium's By.XPATH constant.
    link = self.driver.find_element(
        'xpath', '//li[contains(@id, "n-randompage")]/a'
    )
    self.click(link)
|
||||
|
||||
class ArticlePage(PageRootObject):
|
||||
|
13
settings.py
13
settings.py
@ -3,19 +3,18 @@ import logging
|
||||
class Settings:
|
||||
# Application Parameters
|
||||
LOG_LEVEL = logging.INFO
|
||||
DO_BREAKPOINTS = False
|
||||
DO_BREAKPOINTS = True
|
||||
PAGE_DELAY = 0
|
||||
|
||||
# Web Driver Parameters
|
||||
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
|
||||
WEBDRIVER_SERIALIZE_DUMP_LOC = '/tmp/saved_webdrivers.pickle'
|
||||
WEBDRIVER_STORED_NAME = 'sok-scrape'
|
||||
WEBDRIVER_EXECUTOR_PORT = 4444
|
||||
WEBDRIVER_REMOTE_EXECUTOR = 'http://127.0.0.1:%s/wd/hub'
|
||||
WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
|
||||
|
||||
# Web Page Parameters
|
||||
# Wikipedia Parameters
|
||||
PAGE_BASE_URL = 'https://www.wikipedia.org/'
|
||||
PAGE_LANGUAGE = 'English'
|
||||
PAGE_DELAY = 0
|
||||
# PAGE_LANGUAGE = 'Español'
|
||||
# PAGE_LANGUAGE = 'Русский'
|
||||
|
||||
# Data Layer Parameters
|
||||
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
|
||||
|
Loading…
Reference in New Issue
Block a user