From 64093c58a285073a32dda3256e1a0d4d231711a1 Mon Sep 17 00:00:00 2001 From: Mathew Guest Date: Thu, 17 Aug 2017 01:45:07 -0600 Subject: [PATCH] cli and multi language support --- app/browser.py | 13 ++++++++++--- app/cli.py | 1 + app/model.py | 5 +++-- app/pages.py | 4 +++- settings.py | 13 ++++++------- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/app/browser.py b/app/browser.py index e569a6a..86a5ce4 100644 --- a/app/browser.py +++ b/app/browser.py @@ -1,7 +1,5 @@ -import pickle import selenium import selenium.webdriver -import time import logging settings = {} @@ -10,7 +8,16 @@ def init(settings_obj): global settings settings = settings_obj -def create_webdriver(): +def create_webdriver(driver='chrome'): + if driver == 'chrome': + return create_webdriver_chrome() + elif driver == 'firefox': + return create_webdriver_firefox() + +def create_webdriver_firefox(): + pass + +def create_webdriver_chrome(): opt = selenium.webdriver.chrome.options.Options() opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT) opt.add_argument('--kiosk-printing') diff --git a/app/cli.py b/app/cli.py index d62fafb..67fdba9 100644 --- a/app/cli.py +++ b/app/cli.py @@ -31,6 +31,7 @@ def main(): class InteractiveInterface: def __init__(self): self.model = model.Model() + x = self.model.webdriver # Request the browser open immediately def run(self, args, main=True): try: diff --git a/app/model.py b/app/model.py index 3db6eea..85cb1d7 100644 --- a/app/model.py +++ b/app/model.py @@ -25,7 +25,9 @@ class Model: @property def webdriver(self): if not self._webdriver: - self._webdriver = browser.create_webdriver() + self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER) + page_api = pages.LandingPage(self.webdriver) + page_api.goto_landing_page() return self._webdriver def do_random_page(self): @@ -39,7 +41,6 @@ class Model: page_api.goto_random_article() # Article page - pages_visited = [] while True: page_api = pages.ArticlePage(self.webdriver) diff --git a/app/pages.py b/app/pages.py index 6f7aa10..bcf24a1 100644 --- a/app/pages.py +++ b/app/pages.py @@ -2,6 +2,7 @@ import logging import re import selenium import selenium.webdriver +import time settings = {} @@ -22,6 +23,7 @@ class PageRootObject: def click(self, el): self.highlight(el, 'red') + time.sleep(settings.PAGE_DELAY) el.click() def highlight(self, el, color): @@ -47,7 +49,7 @@ class MainPage(PageRootObject): super().__init__(driver) def goto_random_article(self): - link = self.driver.find_element_by_partial_link_text('Random article') + link = self.driver.find_element_by_xpath('//li[contains(@id, "n-randompage")]/a') self.click(link) class ArticlePage(PageRootObject): diff --git a/settings.py b/settings.py index 013da02..b39979e 100644 --- a/settings.py +++ b/settings.py @@ -3,19 +3,18 @@ import logging class Settings: # Application Parameters LOG_LEVEL = logging.INFO - DO_BREAKPOINTS = False + DO_BREAKPOINTS = True + PAGE_DELAY = 0 # Web Driver Parameters WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)' - WEBDRIVER_SERIALIZE_DUMP_LOC = '/tmp/saved_webdrivers.pickle' - WEBDRIVER_STORED_NAME = 'sok-scrape' - WEBDRIVER_EXECUTOR_PORT = 4444 - WEBDRIVER_REMOTE_EXECUTOR = 'http://127.0.0.1:%s/wd/hub' + WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox' - # Web Page Parameters + # Wikipedia Parameters PAGE_BASE_URL = 'https://www.wikipedia.org/' PAGE_LANGUAGE = 'English' - PAGE_DELAY = 0 + # PAGE_LANGUAGE = 'Español' + # PAGE_LANGUAGE = 'Русский' # Data Layer Parameters SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'