From f093fb9ecc4a11aa38371d3b3c7a1cfdbe002be4 Mon Sep 17 00:00:00 2001
From: Mathew Guest
Date: Fri, 20 Oct 2017 17:39:00 -0600
Subject: [PATCH] some polish

---
 INSTALL.txt          |  9 ++++++
 README.md            | 17 +++++++++++
 launcher.py          |  2 ++
 settings.py          | 10 +++++--
 wikicrawl/browser.py |  3 +-
 wikicrawl/config.py  |  9 ++++++
 wikicrawl/log.py     |  6 ++++
 wikicrawl/main.py    |  4 +++
 wikicrawl/pages.py   | 67 ++++++++++++++++++++++++++++++++++++--------
 9 files changed, 112 insertions(+), 15 deletions(-)
 create mode 100644 INSTALL.txt
 mode change 100644 => 100755 launcher.py
 mode change 100644 => 100755 wikicrawl/browser.py

diff --git a/INSTALL.txt b/INSTALL.txt
new file mode 100644
index 0000000..bd1d4e8
--- /dev/null
+++ b/INSTALL.txt
@@ -0,0 +1,9 @@
+To run this you need:
+
+(1)
+Google Chrome + the Google Chrome WebDriver installed
+
+https://sites.google.com/a/chromium.org/chromedriver/home
+
+On Windows, you want this binary:
+https://chromedriver.storage.googleapis.com/2.31/chromedriver_win32.zip

diff --git a/README.md b/README.md
index 8b13789..f9cb423 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,18 @@
+# Wikicrawl
+This application plays the Road to Philosophy game on Wikipedia. It
+provides an interface from which the user can launch a browser and have
+it repeatedly click the first link of each Wikipedia article until it
+reaches the article on Philosophy. Apparently this works for ~97% of pages.
+
+settings.py: Contains runtime parameters. A few of these need to be
+configured correctly for the program to run.
+
+launcher.py: Run this to start the program: "python ./launcher.py"
+
+setup.py: Installation configuration for third-party dependencies.
+
+To install and run, execute these commands in this directory:
+
+virtualenv pythonenv
+python setup.py install
+python ./launcher.py

diff --git a/launcher.py b/launcher.py
old mode 100644
new mode 100755
index facdc09..4d37cba
--- a/launcher.py
+++ b/launcher.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import wikicrawl
 import settings

diff --git a/settings.py b/settings.py
index 31ee594..ba28a74 100644
--- a/settings.py
+++ b/settings.py
@@ -16,6 +16,11 @@ class DefaultSettings:
     # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
     LOG_FILENAME = '/tmp/wikicrawl.log'
     SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
+    CHROMEDRIVER_EXE = '/usr/bin/chromedriver'
+
+    # On Windows, point this at the chromedriver binary instead, e.g.:
+    # CHROMEDRIVER_EXE = 'C:\\Users\\mathew\\windows-share\\dev\\wikicrawl\\chromedriver.exe'
+
     # Application Parameters
     DO_BREAKPOINTS = False
@@ -40,6 +45,7 @@ class DefaultSettings:
     PAGE_LANGUAGE = 'en'

     # API Keys
+    # Yandex is a web REST API for translating between languages.
 YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'

     # Logging Parameters
@@ -48,7 +54,7 @@ class DefaultSettings:
         'formatters': {
             'colored': {
                 '()': 'colorlog.ColoredFormatter',
-                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
+                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(white)s%(message)s'
             },
             'basic': {
                 '()': 'logging.Formatter',
@@ -106,5 +112,3 @@ class DefaultSettings:
         }
     }
-
-
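Note: the CHROMEDRIVER_EXE path above is hard-coded per platform. A minimal sketch of how the find_chromedriver_path() stub introduced in wikicrawl/config.py below might eventually resolve the path automatically; shutil.which is standard library, but the fallback behavior shown is an assumption, not part of this patch:

    import shutil

    def find_chromedriver_path(default='/usr/bin/chromedriver'):
        # Prefer a chromedriver that is already on PATH; shutil.which
        # returns the absolute path to the executable, or None.
        found = shutil.which('chromedriver')
        if found:
            return found
        # Fall back to the configured default (CHROMEDRIVER_EXE); which()
        # also accepts an absolute path and checks it is executable.
        return shutil.which(default) or default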
diff --git a/wikicrawl/browser.py b/wikicrawl/browser.py
old mode 100644
new mode 100755
index 5fc9357..a4a7f29
--- a/wikicrawl/browser.py
+++ b/wikicrawl/browser.py
@@ -40,7 +40,8 @@ def create_webdriver_firefox():
 def create_webdriver_chrome():
     opt = selenium.webdriver.chrome.options.Options()
     opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
-    driver = selenium.webdriver.Chrome(chrome_options = opt)
+    driver = selenium.webdriver.Chrome(executable_path=config.obj.CHROMEDRIVER_EXE,
+                                       chrome_options=opt)
     return driver

 def create_webdriver_phantom():

diff --git a/wikicrawl/config.py b/wikicrawl/config.py
index 290d66f..c4695d5 100644
--- a/wikicrawl/config.py
+++ b/wikicrawl/config.py
@@ -2,6 +2,8 @@
 # and is used to provide an interface to the runtime configuration for the
 # program.

+import sys
+
 from . import log

 obj = {}
@@ -9,5 +11,12 @@ obj = {}
 def init(settings_obj):
     global obj
     obj = settings_obj
+
+    find_chromedriver_path()
+
     log.init_logging()
+
+def find_chromedriver_path():
+    # TODO: stub; for now it only prints this module's location.
+    print(__file__)

diff --git a/wikicrawl/log.py b/wikicrawl/log.py
index a1dbc37..4a14a36 100644
--- a/wikicrawl/log.py
+++ b/wikicrawl/log.py
@@ -20,6 +20,12 @@ class LoggingLayer:
         self.loggers = {}
         logging.config.dictConfig(config)

+    # Note on __getitem__:
+    # __getitem__ overrides the functionality of the [] operator.
+    # That means that, given
+    #     objinstance = LoggingLayer(...)
+    # the expression objinstance[foo] calls objinstance.__getitem__(foo)
+    # and returns the result.
     def __getitem__(self, k):
         logger = self.loggers.get(k)
         if not logger:
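The __getitem__ note above can be demonstrated standalone. A tiny, hypothetical example of the same pattern (the Registry class is illustrative, not part of this codebase):

    class Registry:
        def __init__(self):
            self.items = {}

        def __getitem__(self, k):
            # Runs whenever registry[k] is evaluated; create the entry on
            # first use, much like LoggingLayer caches its loggers.
            if k not in self.items:
                self.items[k] = 'value-for-' + k
            return self.items[k]

    registry = Registry()
    print(registry['foo'])  # calls registry.__getitem__('foo') -> 'value-for-foo'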
""" - heading = self.driver.find_element_by_id(ArticlePage.elements['article-title']) + heading = self.driver.find_element_by_id(ArticlePage.elements['article-title-id']) return heading.text def click_first_link(self): @@ -175,7 +175,30 @@ class ArticlePage(PageRootObject): the first link, for performance optimization. """ main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id']) - paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]') + + # Note on xpath (more advanced web automation tool): + # xpath is another language that allows you to concisely specify an + # element in the page (or more generally any xml or html structured + # text). You can define a hierarchical tree structure that an element + # must have, attributes that any of the nodes must have or not have, + # and even some more complex functionality. For example, you can + # say in xpath to give me all the links that are a child of the + # navigation menu. This xpath here looks for paragraph elements + # that fall under this structure: + # + #
+ #
+ #

+ # ...wikipedia article content... + # + # and does NOT have: + # ... + # + #

+ #
+ #
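The selector above can be exercised outside the browser. A sketch using lxml (an assumption; lxml is not a dependency of this project) against a toy copy of the structure described in the comment:

    from lxml import html

    page = html.fromstring('''
    <div id="mw-content-text">
      <div class="mw-parser-output">
        <p><span><span id="coordinates">coords</span></span></p>
        <p>First real paragraph with a <a href="/wiki/Link">link</a>.</p>
      </div>
    </div>
    ''')

    main_window = page.get_element_by_id('mw-content-text')
    xpath_str = ('./div[contains(@class, "mw-parser-output")]'
                 '/p[not(span/span[contains(@id, "coordinates")])]')
    paragraphs = main_window.xpath(xpath_str)
    print(len(paragraphs))               # 1 -- the coordinates paragraph is skipped
    print(paragraphs[0].text_content())  # First real paragraph with a link.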
@@ -243,8 +266,15 @@ class ArticlePage(PageRootObject):
         link_text = el.get_attribute('outerHTML')
         p_text = p.get_attribute('innerHTML')

-        regex_str = '\(.*?\)' # Regular expression to extract the
-                              # text inside (not nested) parenthesis
+        # Note on regular expressions (advanced):
+        # Regular expressions, or regexes, are another language for
+        # matching patterns in raw text (regex is blind to html structure).
+        # Regular expressions are notorious because they can be hard to
+        # understand and read, but they are extremely expressive, i.e. you
+        # can convey a great deal of functionality in one line of code.
+        # This one below is among the simplest and just grabs the text
+        # inside a pair of parentheses.
+        regex_str = r'\(.*?\)'
         regex = re.compile(regex_str, flags=re.UNICODE)
         match = regex.search(p_text)
         if not match:
@@ -258,11 +288,13 @@ class ArticlePage(PageRootObject):
         #
         # Care must be taken with regular expressions as they are
         # user/developer unfriendly, hard-to-read, and unforgiving.
-        # For example, what happens when you try to match ()
-        # inside of (some words) some more words (even more words), you
-        # can match unpaired parenthesis and the computer will return
-        # unexpected results. The code is quite dumb and does exactly
-        # what you tell it to.
+        # For example, when you try to extract the text inside parentheses
+        # from "(some words) some more words (even more words)", you can
+        # match unpaired parentheses: "some more words" is itself inside a
+        # pair of parentheses, the pair formed by the first '(' and the
+        # last ')'. The code is quite dumb and does exactly what you tell
+        # it to, which often leads to unexpected results because the
+        # computer is so literal.
         match_text = match.group(0)
         match_idx = match.end(0)
         if link_text in match_text:
@@ -271,6 +303,17 @@ class ArticlePage(PageRootObject):

         return False

+    # Side Teaser: This is a regular expression to match valid email
+    # addresses. It reads: one or more alphanumeric characters (also _,
+    # ., and -), followed by '@', followed by one or more alphanumeric
+    # characters (also . and -), followed by a period and between 2 and
+    # 6 lower-case characters a-z (with . also accepted).
+    #
+    # /^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/
+    #
+    # You can see how regexes get messy quickly, but one line of regex
+    # can do what would otherwise take 100 lines of imperative code.
+
     def _is_link_a_footnote(self, el):
         # Some links are anchors to footnotes, e.g. [1] that points to a source
         # at the bottom of the page. These aren't valid links for our purpose
@@ -302,6 +345,8 @@ class ArticlePage(PageRootObject):
         return False

     def _is_not_wikipedia(self, el):
+        # Some links point to websites outside of Wikipedia; we skip
+        # those with this check.
         href = el.get_attribute('href')
         if 'wikipedia.org' not in href:
             return True
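A quick, self-contained demonstration of the patterns discussed above: the non-greedy parentheses match with its unpaired-parenthesis pitfall, and the email teaser (both are illustrative sketches):

    import re

    text = '(some words) some more words (even more words)'
    print(re.findall(r'\(.*?\)', text))
    # ['(some words)', '(even more words)']

    # The pitfall: with an unpaired '(' the non-greedy match can end at a
    # ')' belonging to a different "pair" than the reader expects.
    broken = '(unclosed (some words) trailing)'
    print(re.search(r'\(.*?\)', broken).group(0))
    # '(unclosed (some words)'

    # The email teaser, translated to Python:
    email_re = re.compile(r'^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$')
    print(bool(email_re.match('user.name@example.com')))  # True
    print(bool(email_re.match('not-an-email')))           # False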