Mirror of https://git.zavage.net/Zavage-Software/wikicrawl.git (synced 2024-12-04 05:39:20 -07:00)

commit f093fb9ecc (parent 9958b5b9aa)

    some polish
INSTALL.txt (new file, 9 changes)
@@ -0,0 +1,9 @@
+For this to run you need either
+
+(1)
+Google Chrome + Google Chrome WebDriver installed
+
+https://sites.google.com/a/chromium.org/chromedriver/home
+
+You want the windows binary, it's this one:
+https://chromedriver.storage.googleapis.com/2.31/chromedriver_win32.zip
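As a quick sanity check of the Chrome + ChromeDriver pairing that INSTALL.txt asks for, a minimal sketch looks like the following. It assumes the Selenium 3-era API this repository uses (executable_path) and a hypothetical driver location; adjust the path to wherever the unzipped binary lives.

    # Minimal smoke test for the Chrome + ChromeDriver setup described above.
    # Assumes Selenium 3.x and a hypothetical chromedriver path.
    import selenium.webdriver

    CHROMEDRIVER_EXE = '/usr/bin/chromedriver'   # e.g. the unzipped chromedriver.exe on Windows

    driver = selenium.webdriver.Chrome(executable_path=CHROMEDRIVER_EXE)
    driver.get('https://en.wikipedia.org/wiki/Philosophy')
    print(driver.title)   # prints the article title if the pairing works
    driver.quit()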
README.md (17 changes)
@@ -1 +1,18 @@
 = Wikicrawl =
+This application plays the road to philosophy game on wikipedia. An
+interface is given where the user can launch a browser and have it
+repeatedly click the first link on wikipedia articles until it reaches the article on philosophy. Apparently this works for ~97%
+of pages.
+
+settings.py: Contains runtime parameters. There are a few things
+that need configured correctly for this to run.
+
+launcher.py: Run to start the program. "python ./launcher.py" is the command you need.
+
+setup.py: Installation configuration for third-party dependencies.
+
+To install, run these commands in this directory:
+
+virtualenv pythonenv
+python setup.py install
+python ./launcher.py
launcher.py (2 changes; mode changed: normal file → executable file)
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import wikicrawl
 import settings
 
settings.py (10 changes)
@@ -16,6 +16,11 @@ class DefaultSettings:
     # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
     LOG_FILENAME = '/tmp/wikicrawl.log'
     SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
+    CHROMEDRIVER_EXE = '/usr/bin/chromedriver'
+
+    # CHROMEDRIVER_EXE = 'C:\\Users\\mathew\\windows-share\\dev\\wikicrawl\\chromedriver.exe'
+
+
 
     # Application Parameters
     DO_BREAKPOINTS = False
@@ -40,6 +45,7 @@ class DefaultSettings:
     PAGE_LANGUAGE = 'en'
 
     # API Keys
+    # Yandex is a web REST API for translating between different languages.
     YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'
 
     # Logging Parameters
@@ -48,7 +54,7 @@ class DefaultSettings:
         'formatters': {
             'colored': {
                 '()': 'colorlog.ColoredFormatter',
-                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
+                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(white)s%(message)s'
             },
             'basic': {
                 '()': 'logging.Formatter',
@@ -106,5 +112,3 @@ class DefaultSettings:
         }
     }
-
-
 
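The formatter tweak above swaps %(blue)s for %(white)s in the colorlog format string. To make that change easier to read in isolation, here is a hedged, self-contained sketch of how a dictConfig block with colorlog's ColoredFormatter is consumed; the DEMO_LOG_SETTINGS name and the 'demo' logger are illustrative, not taken from settings.py.

    # Hedged sketch: how a dictConfig entry like the one edited above is consumed.
    # Assumes the colorlog package is installed.
    import logging
    import logging.config

    DEMO_LOG_SETTINGS = {
        'version': 1,
        'formatters': {
            'colored': {
                '()': 'colorlog.ColoredFormatter',   # dictConfig factory key
                'format': '%(log_color)s%(levelname)-8s%(reset)s:'
                          '%(log_color)s%(name)-5s%(reset)s:%(white)s%(message)s',
            },
        },
        'handlers': {
            'console': {'class': 'logging.StreamHandler', 'formatter': 'colored'},
        },
        'root': {'handlers': ['console'], 'level': 'DEBUG'},
    }

    logging.config.dictConfig(DEMO_LOG_SETTINGS)
    logging.getLogger('demo').info('message text, now colored white instead of blue')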
wikicrawl/browser.py (3 changes; mode changed: normal file → executable file)
@@ -40,7 +40,8 @@ def create_webdriver_firefox():
 def create_webdriver_chrome():
     opt = selenium.webdriver.chrome.options.Options()
     opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
-    driver = selenium.webdriver.Chrome(chrome_options = opt)
+    driver = selenium.webdriver.Chrome(executable_path=config.obj.CHROMEDRIVER_EXE,
+                                       chrome_options=opt)
     return driver
 
 def create_webdriver_phantom():
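The change above passes the driver location through executable_path, which matches the Selenium 3 API used throughout this repo. For readers on Selenium 4 or newer, where executable_path and chrome_options were removed, the equivalent construction goes through a Service object; this is a hedged sketch, not part of the commit.

    # Hedged sketch, not part of this commit: the Selenium 4+ equivalent of
    # create_webdriver_chrome() above, where executable_path and chrome_options
    # were replaced by a Service object and the options= keyword.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    def create_webdriver_chrome_selenium4(chromedriver_exe, user_agent):
        opt = Options()
        opt.add_argument('--user-agent=' + user_agent)
        service = Service(executable_path=chromedriver_exe)
        return webdriver.Chrome(service=service, options=opt)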
@@ -2,6 +2,8 @@
 # and is used to provide an interface to the runtime configuration for the
 # program.
 
+import sys
+
 from . import log
 
 obj = {}
@@ -9,5 +11,12 @@ obj = {}
 def init(settings_obj):
     global obj
     obj = settings_obj
+
+    find_chromedriver_path()
+
+
     log.init_logging()
+
+def find_chromedriver_path():
+    print(__file__)
 
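The two hunks above appear to touch the package's configuration module (it holds the shared obj settings object and calls log.init_logging()). The find_chromedriver_path() helper added there is only a print(__file__) stub so far; purely as a hedged sketch with assumed behavior, such a helper might eventually look like this:

    # Hedged sketch only: one way a find_chromedriver_path() helper could resolve the
    # driver binary. The commit itself adds just a print(__file__) stub; nothing below
    # is taken from the repo.
    import os
    import shutil
    import sys

    def find_chromedriver_path(configured_path=None):
        # 1. Respect an explicit setting such as CHROMEDRIVER_EXE when it exists on disk.
        if configured_path and os.path.isfile(configured_path):
            return configured_path
        # 2. Otherwise fall back to whatever is on PATH (chromedriver.exe on Windows).
        exe_name = 'chromedriver.exe' if sys.platform.startswith('win') else 'chromedriver'
        found = shutil.which(exe_name)
        if found:
            return found
        raise FileNotFoundError('chromedriver not found; see INSTALL.txt')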
@@ -20,6 +20,12 @@ class LoggingLayer:
         self.loggers = {}
         logging.config.dictConfig(config)
 
+    # Note on __getitem__:
+    # __getitem__ overrides the functionality of the [] operator.
+    # That means this code:
+    # objinstance = LoggingLayer(...)
+    # objinstance[foo] calls LoggingLayer.__getitem__(foo)
+    # and returns the result.
     def __getitem__(self, k):
         logger = self.loggers.get(k)
         if not logger:
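The comment block added above explains __getitem__; the tiny standalone example below (not repo code, illustrative names only) shows the same mechanism, which is how indexing an instance with [] ends up in that method.

    # Standalone illustration of the __getitem__ note above (not repo code):
    # indexing an object with [] dispatches to its __getitem__ method.
    import logging

    class DemoLoggingLayer:
        def __init__(self):
            self.loggers = {}

        def __getitem__(self, name):
            # demo_layer['crawler'] lands here with name == 'crawler'
            logger = self.loggers.get(name)
            if not logger:
                logger = logging.getLogger(name)
                self.loggers[name] = logger
            return logger

    demo_layer = DemoLoggingLayer()
    demo_layer['crawler'].warning('fetched via [] -> __getitem__')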
@@ -3,6 +3,7 @@
 import sys
 
 from . import cli
+from . import config
 from . import util
 
 def main():
@@ -14,6 +15,9 @@ def main():
     else:
         user_interface.start_command_loop()
 
+def verify_config_is_valid():
+    pass
+
 if __name__ == '__main__':
     main()
 
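verify_config_is_valid() is added above as a bare pass stub. Given the warning in settings.py that the filepath parameters must exist, one plausible eventual shape, sketched here with an assumed settings_obj argument and assumed checks, is:

    # Hedged sketch only: a possible body for the verify_config_is_valid() stub added
    # above. The commit leaves it as pass; the argument, checks, and messages below
    # are assumptions.
    import os

    def verify_config_is_valid(settings_obj):
        # settings.py: "Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!"
        problems = []
        if not os.path.isfile(settings_obj.CHROMEDRIVER_EXE):
            problems.append('CHROMEDRIVER_EXE does not point to a chromedriver binary')
        if not os.path.isdir(os.path.dirname(settings_obj.LOG_FILENAME)):
            problems.append('directory for LOG_FILENAME does not exist')
        if not os.path.isdir(os.path.dirname(settings_obj.SQLITE_DBFILE)):
            problems.append('directory for SQLITE_DBFILE does not exist')
        return problems   # an empty list means the configuration looks usable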
@@ -72,7 +72,7 @@ class PageRootObject:
             # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
         else:
             # color = color
-            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
+            pass
         js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
         self.driver.execute_script(js, el)
 
@@ -135,7 +135,7 @@ class ArticlePage(PageRootObject):
     # select unique elements with high confidence.
     elements = {
         'main-window-content-text-id': 'mw-content-text',
-        'article-title': 'firstHeading',
+        'article-title-id': 'firstHeading',
     }
 
     def __init__(self, driver=None):
@@ -145,7 +145,7 @@ class ArticlePage(PageRootObject):
         """
         Returns the article title.
         """
-        heading = self.driver.find_element_by_id(ArticlePage.elements['article-title'])
+        heading = self.driver.find_element_by_id(ArticlePage.elements['article-title-id'])
         return heading.text
 
     def click_first_link(self):
@@ -175,7 +175,30 @@ class ArticlePage(PageRootObject):
         the first link, for performance optimization.
         """
         main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
-        paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
+
+        # Note on xpath (more advanced web automation tool):
+        # xpath is another language that allows you to concisely specify an
+        # element in the page (or more generally any xml or html structured
+        # text). You can define a hierarchical tree structure that an element
+        # must have, attributes that any of the nodes must have or not have,
+        # and even some more complex functionality. For example, you can
+        # say in xpath to give me all the links that are a child of the
+        # navigation menu. This xpath here looks for paragraph elements
+        # that fall under this structure:
+        #
+        # <div id="mw-content-text">
+        # <div class="mw-parser-output">
+        # <p>
+        # ...wikipedia article content...
+        #
+        # and does NOT have:
+        # <span><span id="coordinates">...</span></span>
+        #
+        # </p>
+        # </div>
+        # </div>
+        xpath_str = './div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]'
+        paragraphs = main_window.find_elements_by_xpath(xpath_str)
         for p in paragraphs:
             # Return code indicates the success status of _parse_paragraph().
             # In this case, an rc of True means that it was able to find a
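The xpath in the hunk above can be exercised outside the browser as well. The following self-contained example assumes the lxml package (not a dependency of this project) and uses a toy stand-in for a Wikipedia article body:

    # Standalone illustration of the xpath in the hunk above, using lxml (assumed to
    # be installed; it is not one of this project's dependencies).
    import lxml.html

    html = '''<div id="mw-content-text">
      <div class="mw-parser-output">
        <p><span><span id="coordinates">51N 0W</span></span>Coordinates paragraph.</p>
        <p>Philosophy is the study of general and fundamental questions.</p>
      </div>
    </div>'''

    main_window = lxml.html.fromstring(html)   # root element is the outer div
    xpath_str = './div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]'
    paragraphs = main_window.xpath(xpath_str)
    print([p.text_content() for p in paragraphs])
    # ['Philosophy is the study of general and fundamental questions.']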
@@ -243,8 +266,15 @@ class ArticlePage(PageRootObject):
         link_text = el.get_attribute('outerHTML')
         p_text = p.get_attribute('innerHTML')
 
-        regex_str = '\(.*?\)' # Regular expression to extract the
-                              # text inside (not nested) parenthesis
+        # Note on regular expressions (advanced):
+        # Regular expressions or regexes are another language for
+        # matching patterns in raw text (regex is blind to html structure).
+        # Regular expressions are notorious because they can be
+        # hard to understand and read but they are extremely
+        # expressive, i.e. you can convey a great deal of functionality
+        # in one line of code. This one below is among the simplest and
+        # just grabs text inside a pair of parenthesis.
+        regex_str = '\(.*?\)'
         regex = re.compile(regex_str, flags=re.UNICODE)
         match = regex.search(p_text)
         if not match:
@@ -258,11 +288,13 @@ class ArticlePage(PageRootObject):
         #
         # Care must be taken with regular expressions as they are
         # user/developer unfriendly, hard-to-read, and unforgiving.
-        # For example, what happens when you try to match (<anything>)
-        # inside of (some words) some more words (even more words), you
-        # can match unpaired parenthesis and the computer will return
-        # unexpected results. The code is quite dumb and does exactly
-        # what you tell it to.
+        # For example, what happens when you try to extract text inside parenthesis
+        # inside of "(some words) some more words (even more words)", you
+        # can match unpaired parenthesis. This means "some more words" is indeed
+        # inside a pair of parenthesis, the pair given by the entire string.
+        # The code is quite dumb and does exactly what you tell it to.
+        # Often this leads to unexpected results because the computer
+        # is so literal.
         match_text = match.group(0)
         match_idx = match.end(0)
         if link_text in match_text:
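The comments above describe how matching can pair up the wrong parentheses; the short demonstration below (standard library only, not repo code) shows the difference the non-greedy ? makes on exactly that example string.

    # Demonstration of the non-greedy '\(.*?\)' pattern and the pitfall discussed in
    # the comments above (standard library only; not part of the repo).
    import re

    text = '(some words) some more words (even more words)'

    greedy = re.search(r'\(.*\)', text).group(0)
    print(greedy)       # '(some words) some more words (even more words)'  (whole string)

    non_greedy = re.search(r'\(.*?\)', text).group(0)
    print(non_greedy)   # '(some words)'  (the ? makes .* stop at the first closing paren)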
@@ -271,6 +303,17 @@ class ArticlePage(PageRootObject):
 
         return False
 
+    # Side Teaser: This is a regular expression to match valid email
+    # addresses. It reads 1-or more alphanumeric characters (also _ and . and -)
+    # followed by a '@' followed by 1-or more alphanumeric characters
+    # (also . and -), followed by a period and between 2 and 6 lower-cased
+    # characters a-z (and . accepted).
+    #
+    # /^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/
+    #
+    # You can see how they get messy quick but one line of regex could
+    # do the same thing that would take 100 lines of imperative code.
+
     def _is_link_a_footnote(self, el):
         # Some links are anchors to footnotes, e.g. [1] that points to a source
         # at the bottom of the page. These aren't valid links for our purpose
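The side-teaser pattern above is written with /.../-style delimiters as in JavaScript or sed; in Python the slashes are dropped and the bare pattern handed to re. A quick illustrative check (not repo code):

    # Quick check of the email "side teaser" pattern above. The surrounding slashes
    # are JavaScript/sed-style delimiters and are dropped here; Python's re takes
    # the bare pattern.
    import re

    email_re = re.compile(r'^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$')

    print(bool(email_re.match('mathew@example.com')))      # True
    print(bool(email_re.match('not-an-email@nowhere')))    # False: no '.tld' at the end
    print(email_re.match('user.name@example.co.uk').groups())
    # ('user.name', 'example.co', 'uk')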
@@ -302,6 +345,8 @@ class ArticlePage(PageRootObject):
             return False
 
     def _is_not_wikipedia(self, el):
+        # Some links point to websites outside of wikipedia, we skip those
+        # with this.
        href = el.get_attribute('href')
        if 'wikipedia.org' not in href:
            return True