From f093fb9ecc4a11aa38371d3b3c7a1cfdbe002be4 Mon Sep 17 00:00:00 2001
From: Mathew Guest
Date: Fri, 20 Oct 2017 17:39:00 -0600
Subject: [PATCH] some polish

---
 INSTALL.txt          |  9 ++++++
 README.md            | 17 +++++++++++
 launcher.py          |  2 ++
 settings.py          | 10 +++++--
 wikicrawl/browser.py |  3 +-
 wikicrawl/config.py  |  9 ++++++
 wikicrawl/log.py     |  6 ++++
 wikicrawl/main.py    |  4 +++
 wikicrawl/pages.py   | 67 ++++++++++++++++++++++++++++++++++++--------
 9 files changed, 112 insertions(+), 15 deletions(-)
 create mode 100644 INSTALL.txt
 mode change 100644 => 100755 launcher.py
 mode change 100644 => 100755 wikicrawl/browser.py

diff --git a/INSTALL.txt b/INSTALL.txt
new file mode 100644
index 0000000..bd1d4e8
--- /dev/null
+++ b/INSTALL.txt
@@ -0,0 +1,9 @@
+To run this you need:
+
+(1)
+Google Chrome + the Google Chrome WebDriver installed
+
+https://sites.google.com/a/chromium.org/chromedriver/home
+
+On Windows, you want this binary:
+https://chromedriver.storage.googleapis.com/2.31/chromedriver_win32.zip

diff --git a/README.md b/README.md
index 8b13789..f9cb423 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,18 @@
+# Wikicrawl
+This application plays the Road to Philosophy game on Wikipedia. It
+provides an interface from which the user can launch a browser and have
+it repeatedly click the first link of each Wikipedia article until it
+reaches the article on Philosophy. Apparently this works for ~97% of pages.
+
+settings.py: Contains runtime parameters. A few of these need to be
+configured correctly for the program to run.
+
+launcher.py: Run this to start the program: "python ./launcher.py"
+
+setup.py: Installation configuration for third-party dependencies.
+
+To install and run, execute these commands in this directory:
+
+virtualenv pythonenv
+python setup.py install
+python ./launcher.py

diff --git a/launcher.py b/launcher.py
old mode 100644
new mode 100755
index facdc09..4d37cba
--- a/launcher.py
+++ b/launcher.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import wikicrawl
 import settings

diff --git a/settings.py b/settings.py
index 31ee594..ba28a74 100644
--- a/settings.py
+++ b/settings.py
@@ -16,6 +16,11 @@ class DefaultSettings:
     # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
     LOG_FILENAME = '/tmp/wikicrawl.log'
     SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
+    CHROMEDRIVER_EXE = '/usr/bin/chromedriver'
+
+    # On Windows, point this at the chromedriver binary instead, e.g.:
+    # CHROMEDRIVER_EXE = 'C:\\Users\\mathew\\windows-share\\dev\\wikicrawl\\chromedriver.exe'
+
     # Application Parameters
     DO_BREAKPOINTS = False
@@ -40,6 +45,7 @@ class DefaultSettings:
     PAGE_LANGUAGE = 'en'

     # API Keys
+    # Yandex is a web REST API for translating between languages.
 YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'

     # Logging Parameters
@@ -48,7 +54,7 @@ class DefaultSettings:
         'formatters': {
             'colored': {
                 '()': 'colorlog.ColoredFormatter',
-                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
+                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(white)s%(message)s'
             },
             'basic': {
                 '()': 'logging.Formatter',
@@ -106,5 +112,3 @@ class DefaultSettings:
         }
     }
-
-
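Note: the CHROMEDRIVER_EXE path above is hard-coded per platform. A minimal sketch of how the find_chromedriver_path() stub introduced in wikicrawl/config.py below might eventually resolve the path automatically; shutil.which is standard library, but the fallback behavior shown is an assumption, not part of this patch:

    import shutil

    def find_chromedriver_path(default='/usr/bin/chromedriver'):
        # Prefer a chromedriver that is already on PATH; shutil.which
        # returns the absolute path to the executable, or None.
        found = shutil.which('chromedriver')
        if found:
            return found
        # Fall back to the configured default (CHROMEDRIVER_EXE); which()
        # also accepts an absolute path and checks it is executable.
        return shutil.which(default) or default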
diff --git a/wikicrawl/browser.py b/wikicrawl/browser.py
old mode 100644
new mode 100755
index 5fc9357..a4a7f29
--- a/wikicrawl/browser.py
+++ b/wikicrawl/browser.py
@@ -40,7 +40,8 @@ def create_webdriver_firefox():
 def create_webdriver_chrome():
     opt = selenium.webdriver.chrome.options.Options()
     opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
-    driver = selenium.webdriver.Chrome(chrome_options = opt)
+    driver = selenium.webdriver.Chrome(executable_path=config.obj.CHROMEDRIVER_EXE,
+                                       chrome_options=opt)
     return driver

 def create_webdriver_phantom():

diff --git a/wikicrawl/config.py b/wikicrawl/config.py
index 290d66f..c4695d5 100644
--- a/wikicrawl/config.py
+++ b/wikicrawl/config.py
@@ -2,6 +2,8 @@
 # and is used to provide an interface to the runtime configuration for the
 # program.

+import sys
+
 from . import log

 obj = {}
@@ -9,5 +11,12 @@ obj = {}
 def init(settings_obj):
     global obj
     obj = settings_obj
+
+    find_chromedriver_path()
+
     log.init_logging()
+
+def find_chromedriver_path():
+    # TODO: stub; for now it only prints this module's location.
+    print(__file__)

diff --git a/wikicrawl/log.py b/wikicrawl/log.py
index a1dbc37..4a14a36 100644
--- a/wikicrawl/log.py
+++ b/wikicrawl/log.py
@@ -20,6 +20,12 @@ class LoggingLayer:
         self.loggers = {}
         logging.config.dictConfig(config)

+    # Note on __getitem__:
+    # __getitem__ overrides the functionality of the [] operator.
+    # That means that, given
+    #     objinstance = LoggingLayer(...)
+    # the expression objinstance[foo] calls objinstance.__getitem__(foo)
+    # and returns the result.
     def __getitem__(self, k):
         logger = self.loggers.get(k)
         if not logger:
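The __getitem__ note above can be demonstrated standalone. A tiny, hypothetical example of the same pattern (the Registry class is illustrative, not part of this codebase):

    class Registry:
        def __init__(self):
            self.items = {}

        def __getitem__(self, k):
            # Runs whenever registry[k] is evaluated; create the entry on
            # first use, much like LoggingLayer caches its loggers.
            if k not in self.items:
                self.items[k] = 'value-for-' + k
            return self.items[k]

    registry = Registry()
    print(registry['foo'])  # calls registry.__getitem__('foo') -> 'value-for-foo'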
""" - heading = self.driver.find_element_by_id(ArticlePage.elements['article-title']) + heading = self.driver.find_element_by_id(ArticlePage.elements['article-title-id']) return heading.text def click_first_link(self): @@ -175,7 +175,30 @@ class ArticlePage(PageRootObject): the first link, for performance optimization. """ main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id']) - paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]') + + # Note on xpath (more advanced web automation tool): + # xpath is another language that allows you to concisely specify an + # element in the page (or more generally any xml or html structured + # text). You can define a hierarchical tree structure that an element + # must have, attributes that any of the nodes must have or not have, + # and even some more complex functionality. For example, you can + # say in xpath to give me all the links that are a child of the + # navigation menu. This xpath here looks for paragraph elements + # that fall under this structure: + # + #
+ #
+ #

+ # ...wikipedia article content... + # + # and does NOT have: + # ... + # + #

+ #
+ #
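The selector above can be exercised outside the browser. A sketch using lxml (an assumption; lxml is not a dependency of this project) against a toy copy of the structure described in the comment:

    from lxml import html

    page = html.fromstring('''
    <div id="mw-content-text">
      <div class="mw-parser-output">
        <p><span><span id="coordinates">coords</span></span></p>
        <p>First real paragraph with a <a href="/wiki/Link">link</a>.</p>
      </div>
    </div>
    ''')

    main_window = page.get_element_by_id('mw-content-text')
    xpath_str = ('./div[contains(@class, "mw-parser-output")]'
                 '/p[not(span/span[contains(@id, "coordinates")])]')
    paragraphs = main_window.xpath(xpath_str)
    print(len(paragraphs))               # 1 -- the coordinates paragraph is skipped
    print(paragraphs[0].text_content())  # First real paragraph with a link.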
@@ -243,8 +266,15 @@ class ArticlePage(PageRootObject):
         link_text = el.get_attribute('outerHTML')
         p_text = p.get_attribute('innerHTML')

-        regex_str = '\(.*?\)' # Regular expression to extract the
-                              # text inside (not nested) parenthesis
+        # Note on regular expressions (advanced):
+        # Regular expressions, or regexes, are another language for
+        # matching patterns in raw text (regex is blind to html structure).
+        # Regular expressions are notorious because they can be hard to
+        # understand and read, but they are extremely expressive, i.e. you
+        # can convey a great deal of functionality in one line of code.
+        # This one below is among the simplest and just grabs the text
+        # inside a pair of parentheses.
+        regex_str = r'\(.*?\)'
         regex = re.compile(regex_str, flags=re.UNICODE)
         match = regex.search(p_text)
         if not match:
@@ -258,11 +288,13 @@ class ArticlePage(PageRootObject):
         #
         # Care must be taken with regular expressions as they are
         # user/developer unfriendly, hard-to-read, and unforgiving.
-        # For example, what happens when you try to match ()
-        # inside of (some words) some more words (even more words), you
-        # can match unpaired parenthesis and the computer will return
-        # unexpected results. The code is quite dumb and does exactly
-        # what you tell it to.
+        # For example, when you try to extract the text inside parentheses
+        # from "(some words) some more words (even more words)", you can
+        # match unpaired parentheses: "some more words" is itself inside a
+        # pair of parentheses, the pair formed by the first '(' and the
+        # last ')'. The code is quite dumb and does exactly what you tell
+        # it to, which often leads to unexpected results because the
+        # computer is so literal.
         match_text = match.group(0)
         match_idx = match.end(0)
         if link_text in match_text:
@@ -271,6 +303,17 @@ class ArticlePage(PageRootObject):

         return False

+    # Side Teaser: This is a regular expression to match valid email
+    # addresses. It reads: one or more alphanumeric characters (also _,
+    # ., and -), followed by '@', followed by one or more alphanumeric
+    # characters (also . and -), followed by a period and between 2 and
+    # 6 lower-case characters a-z (with . also accepted).
+    #
+    # /^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/
+    #
+    # You can see how regexes get messy quickly, but one line of regex
+    # can do what would otherwise take 100 lines of imperative code.
+
     def _is_link_a_footnote(self, el):
         # Some links are anchors to footnotes, e.g. [1] that points to a source
         # at the bottom of the page. These aren't valid links for our purpose
@@ -302,6 +345,8 @@ class ArticlePage(PageRootObject):
         return False

     def _is_not_wikipedia(self, el):
+        # Some links point to websites outside of Wikipedia; we skip
+        # those with this check.
         href = el.get_attribute('href')
         if 'wikipedia.org' not in href:
             return True
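A quick, self-contained demonstration of the patterns discussed above: the non-greedy parentheses match with its unpaired-parenthesis pitfall, and the email teaser (both are illustrative sketches):

    import re

    text = '(some words) some more words (even more words)'
    print(re.findall(r'\(.*?\)', text))
    # ['(some words)', '(even more words)']

    # The pitfall: with an unpaired '(' the non-greedy match can end at a
    # ')' belonging to a different "pair" than the reader expects.
    broken = '(unclosed (some words) trailing)'
    print(re.search(r'\(.*?\)', broken).group(0))
    # '(unclosed (some words)'

    # The email teaser, translated to Python:
    email_re = re.compile(r'^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$')
    print(bool(email_re.match('user.name@example.com')))  # True
    print(bool(email_re.match('not-an-email')))           # False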