mirror of
https://git.zavage.net/Zavage-Software/wikicrawl.git
synced 2024-11-22 00:00:25 -07:00
cli and multi language support
This commit is contained in:
parent
309f148700
commit
64093c58a2
@ -1,7 +1,5 @@
|
|||||||
import pickle
|
|
||||||
import selenium
|
import selenium
|
||||||
import selenium.webdriver
|
import selenium.webdriver
|
||||||
import time
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
settings = {}
|
settings = {}
|
||||||
@ -10,7 +8,16 @@ def init(settings_obj):
|
|||||||
global settings
|
global settings
|
||||||
settings = settings_obj
|
settings = settings_obj
|
||||||
|
|
||||||
def create_webdriver():
|
def create_webdriver(driver='chrome'):
|
||||||
|
if driver == 'chrome':
|
||||||
|
return create_webdriver_chrome()
|
||||||
|
elif driver == 'firefox':
|
||||||
|
return create_webdriver_firefox()
|
||||||
|
|
||||||
|
def create_webdriver_firefox():
|
||||||
|
pass
|
||||||
|
|
||||||
|
def create_webdriver_chrome():
|
||||||
opt = selenium.webdriver.chrome.options.Options()
|
opt = selenium.webdriver.chrome.options.Options()
|
||||||
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
|
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
|
||||||
opt.add_argument('--kiosk-printing')
|
opt.add_argument('--kiosk-printing')
|
||||||
|
@ -31,6 +31,7 @@ def main():
|
|||||||
class InteractiveInterface:
|
class InteractiveInterface:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.model = model.Model()
|
self.model = model.Model()
|
||||||
|
x = self.model.webdriver # Request the browser open immediately
|
||||||
|
|
||||||
def run(self, args, main=True):
|
def run(self, args, main=True):
|
||||||
try:
|
try:
|
||||||
|
@ -25,7 +25,9 @@ class Model:
|
|||||||
@property
|
@property
|
||||||
def webdriver(self):
|
def webdriver(self):
|
||||||
if not self._webdriver:
|
if not self._webdriver:
|
||||||
self._webdriver = browser.create_webdriver()
|
self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
|
||||||
|
page_api = pages.LandingPage(self.webdriver)
|
||||||
|
page_api.goto_landing_page()
|
||||||
return self._webdriver
|
return self._webdriver
|
||||||
|
|
||||||
def do_random_page(self):
|
def do_random_page(self):
|
||||||
@ -39,7 +41,6 @@ class Model:
|
|||||||
page_api.goto_random_article()
|
page_api.goto_random_article()
|
||||||
|
|
||||||
# Article page
|
# Article page
|
||||||
|
|
||||||
pages_visited = []
|
pages_visited = []
|
||||||
while True:
|
while True:
|
||||||
page_api = pages.ArticlePage(self.webdriver)
|
page_api = pages.ArticlePage(self.webdriver)
|
||||||
|
@ -2,6 +2,7 @@ import logging
|
|||||||
import re
|
import re
|
||||||
import selenium
|
import selenium
|
||||||
import selenium.webdriver
|
import selenium.webdriver
|
||||||
|
import time
|
||||||
|
|
||||||
settings = {}
|
settings = {}
|
||||||
|
|
||||||
@ -22,6 +23,7 @@ class PageRootObject:
|
|||||||
|
|
||||||
def click(self, el):
|
def click(self, el):
|
||||||
self.highlight(el, 'red')
|
self.highlight(el, 'red')
|
||||||
|
time.sleep(settings.PAGE_DELAY)
|
||||||
el.click()
|
el.click()
|
||||||
|
|
||||||
def highlight(self, el, color):
|
def highlight(self, el, color):
|
||||||
@ -47,7 +49,7 @@ class MainPage(PageRootObject):
|
|||||||
super().__init__(driver)
|
super().__init__(driver)
|
||||||
|
|
||||||
def goto_random_article(self):
|
def goto_random_article(self):
|
||||||
link = self.driver.find_element_by_partial_link_text('Random article')
|
link = self.driver.find_element_by_xpath('//li[contains(@id, "n-randompage")]/a')
|
||||||
self.click(link)
|
self.click(link)
|
||||||
|
|
||||||
class ArticlePage(PageRootObject):
|
class ArticlePage(PageRootObject):
|
||||||
|
13
settings.py
13
settings.py
@ -3,19 +3,18 @@ import logging
|
|||||||
class Settings:
|
class Settings:
|
||||||
# Application Parameters
|
# Application Parameters
|
||||||
LOG_LEVEL = logging.INFO
|
LOG_LEVEL = logging.INFO
|
||||||
DO_BREAKPOINTS = False
|
DO_BREAKPOINTS = True
|
||||||
|
PAGE_DELAY = 0
|
||||||
|
|
||||||
# Web Driver Parameters
|
# Web Driver Parameters
|
||||||
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
|
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
|
||||||
WEBDRIVER_SERIALIZE_DUMP_LOC = '/tmp/saved_webdrivers.pickle'
|
WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
|
||||||
WEBDRIVER_STORED_NAME = 'sok-scrape'
|
|
||||||
WEBDRIVER_EXECUTOR_PORT = 4444
|
|
||||||
WEBDRIVER_REMOTE_EXECUTOR = 'http://127.0.0.1:%s/wd/hub'
|
|
||||||
|
|
||||||
# Web Page Parameters
|
# Wikipedia Parameters
|
||||||
PAGE_BASE_URL = 'https://www.wikipedia.org/'
|
PAGE_BASE_URL = 'https://www.wikipedia.org/'
|
||||||
PAGE_LANGUAGE = 'English'
|
PAGE_LANGUAGE = 'English'
|
||||||
PAGE_DELAY = 0
|
# PAGE_LANGUAGE = 'Español'
|
||||||
|
# PAGE_LANGUAGE = 'Русский'
|
||||||
|
|
||||||
# Data Layer Parameters
|
# Data Layer Parameters
|
||||||
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
|
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
|
||||||
|
Loading…
Reference in New Issue
Block a user