cli and multi language support

This commit is contained in:
Mathew Guest 2017-08-17 01:45:07 -06:00
parent 309f148700
commit 64093c58a2
5 changed files with 23 additions and 13 deletions

@ -1,7 +1,5 @@
import pickle
import selenium import selenium
import selenium.webdriver import selenium.webdriver
import time
import logging import logging
settings = {} settings = {}
@ -10,7 +8,16 @@ def init(settings_obj):
global settings global settings
settings = settings_obj settings = settings_obj
def create_webdriver(driver='chrome'):
    """Create a webdriver for the requested browser.

    Args:
        driver: Browser name, either 'chrome' (the default) or 'firefox'.

    Returns:
        Whatever the matching ``create_webdriver_<browser>`` factory returns.

    Raises:
        ValueError: If ``driver`` is not one of the supported browser names.
    """
    if driver == 'chrome':
        return create_webdriver_chrome()
    if driver == 'firefox':
        return create_webdriver_firefox()
    # Fail loudly: falling off the end would hand the caller None in place
    # of a driver and push the failure to some unrelated later call.
    raise ValueError(
        "Unsupported webdriver browser %r; expected 'chrome' or 'firefox'"
        % (driver,)
    )
def create_webdriver_firefox():
    """Create a Firefox webdriver (placeholder, not implemented yet).

    Raises:
        NotImplementedError: Always. Raising makes the missing support
            explicit instead of returning None as if it were a driver.
    """
    raise NotImplementedError('Firefox webdriver support is not implemented yet')
def create_webdriver_chrome():
opt = selenium.webdriver.chrome.options.Options() opt = selenium.webdriver.chrome.options.Options()
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT) opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
opt.add_argument('--kiosk-printing') opt.add_argument('--kiosk-printing')

@ -31,6 +31,7 @@ def main():
class InteractiveInterface: class InteractiveInterface:
def __init__(self):
    """Build the model and request the browser immediately."""
    self.model = model.Model()
    # Reading the property is what matters here (it triggers creation of
    # the webdriver); the returned value itself is not used.
    _ = self.model.webdriver
def run(self, args, main=True): def run(self, args, main=True):
try: try:

@ -25,7 +25,9 @@ class Model:
@property
def webdriver(self):
    """Return the model's webdriver, creating it on first access.

    On first access the driver is built with
    ``browser.create_webdriver(settings.WEBDRIVER_BROWSER)``, stored on the
    instance, and pointed at the landing page through ``pages.LandingPage``.
    Later accesses return the stored driver unchanged.

    Raises:
        RuntimeError: If the factory hands back no driver at all.
    """
    if not self._webdriver:
        driver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
        if driver is None:
            raise RuntimeError(
                'browser.create_webdriver(%r) returned no webdriver'
                % (settings.WEBDRIVER_BROWSER,)
            )
        self._webdriver = driver
        # Use the local driver instead of self.webdriver so this property
        # is never re-entered while it is still initialising.
        pages.LandingPage(driver).goto_landing_page()
    return self._webdriver
def do_random_page(self): def do_random_page(self):
@ -39,7 +41,6 @@ class Model:
page_api.goto_random_article() page_api.goto_random_article()
# Article page # Article page
pages_visited = [] pages_visited = []
while True: while True:
page_api = pages.ArticlePage(self.webdriver) page_api = pages.ArticlePage(self.webdriver)

@ -2,6 +2,7 @@ import logging
import re import re
import selenium import selenium
import selenium.webdriver import selenium.webdriver
import time
settings = {} settings = {}
@ -22,6 +23,7 @@ class PageRootObject:
def click(self, el):
    """Highlight ``el`` in red, pause for ``settings.PAGE_DELAY``, then click it."""
    self.highlight(el, 'red')
    pause = settings.PAGE_DELAY
    time.sleep(pause)
    el.click()
def highlight(self, el, color): def highlight(self, el, color):
@ -47,7 +49,7 @@ class MainPage(PageRootObject):
super().__init__(driver) super().__init__(driver)
def goto_random_article(self):
    """Click the anchor inside the ``li`` whose id contains "n-randompage"."""
    random_link_xpath = '//li[contains(@id, "n-randompage")]/a'
    self.click(self.driver.find_element_by_xpath(random_link_xpath))
class ArticlePage(PageRootObject): class ArticlePage(PageRootObject):

@ -3,19 +3,18 @@ import logging
class Settings: class Settings:
# Application Parameters # Application Parameters
LOG_LEVEL = logging.INFO LOG_LEVEL = logging.INFO
DO_BREAKPOINTS = False DO_BREAKPOINTS = True
PAGE_DELAY = 0
# Web Driver Parameters # Web Driver Parameters
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)' WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
WEBDRIVER_SERIALIZE_DUMP_LOC = '/tmp/saved_webdrivers.pickle' WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
WEBDRIVER_STORED_NAME = 'sok-scrape'
WEBDRIVER_EXECUTOR_PORT = 4444
WEBDRIVER_REMOTE_EXECUTOR = 'http://127.0.0.1:%s/wd/hub'
# Web Page Parameters # Wikipedia Parameters
PAGE_BASE_URL = 'https://www.wikipedia.org/' PAGE_BASE_URL = 'https://www.wikipedia.org/'
PAGE_LANGUAGE = 'English' PAGE_LANGUAGE = 'English'
PAGE_DELAY = 0 # PAGE_LANGUAGE = 'Español'
# PAGE_LANGUAGE = 'Русский'
# Data Layer Parameters # Data Layer Parameters
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db' SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'