diff --git a/app/__init__.py b/app/__init__.py deleted file mode 100644 index ca1a8a3..0000000 --- a/app/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# The __init__.py file signals to the python interpreter that the -# app directory is a package. A package is a special module that -# contains other modules. Each file is a module (browser, cli, etc.) -# and the "app" package is a module that contains other modules. - -# The "app" module exports the stuff exposed here. We export -# app.init() as a reference to app.config.init() and app.main -# as a reference to app.cli.main - -from .config import init -from .cli import main - diff --git a/app/log.py b/app/log.py deleted file mode 100644 index a62fb3d..0000000 --- a/app/log.py +++ /dev/null @@ -1,7 +0,0 @@ -import logging - -from . import config - -def init_logging(): - logging.basicConfig(level=config.obj.LOG_LEVEL) - diff --git a/launcher.py b/launcher.py index a571da4..facdc09 100644 --- a/launcher.py +++ b/launcher.py @@ -1,9 +1,9 @@ -import app +import wikicrawl import settings # Inject the settings.DefaultSettings object into the # app and start running the program. -app.init(settings.DefaultSettings) -app.main() +wikicrawl.init(settings.DefaultSettings) +wikicrawl.main() input(' to exit') diff --git a/settings.py b/settings.py index c2e9608..767b009 100644 --- a/settings.py +++ b/settings.py @@ -4,28 +4,107 @@ # not hard-coded into the application. For example, some users may want # to run this program in English while others may want to run in Spanish. # The way this works is we specify those variables external from the -# application (here) and pass them into the application (app.config module). -# The application then references app.config.obj to access the variables +# application (here) and pass them into the application (wikicrawl.config module). +# The application then references wikicrawl.config.obj to access the variables # passed in from here. 
+import colorlog import logging +import logging.config class DefaultSettings: + # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!! + LOG_FILENAME = '/tmp/wikicrawl.log' + SQLITE_DBFILE = '/home/mathew/.wikicrawler.db' + # Application Parameters - LOG_LEVEL = logging.INFO - DO_BREAKPOINTS = True + DO_BREAKPOINTS = False PAGE_DELAY = 0 # Web Driver Parameters WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)' - WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox' + + # Requested browser and webdriver dependencies are required for this to work. + # This means you need to have installed on your system: + # Chrome + WebDriver for Chrome + # Firefox + geckodriver for Firefox + # phantomjs for phantom + WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox', 'phantom' # Wikipedia Parameters PAGE_BASE_URL = 'https://www.wikipedia.org/' - PAGE_LANGUAGE = 'English' - # PAGE_LANGUAGE = 'Español' - # PAGE_LANGUAGE = 'Русский' - # Data Layer Parameters - SQLITE_DBFILE = '/home/mathew/.wikicrawler.db' + # Supported Languages so far: + # German, English, Spanish, French, Italian, Portuguese, Polish, Russian + # 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru' + PAGE_LANGUAGE = 'en' + + # API Keys -- NOTE(review): this is a real credential committed to source control; rotate it and load it from the environment instead + YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3' + + # Logging Parameters + LOG_SETTINGS = { + 'version': 1, # version is always 1 + 'formatters': { + 'colored': { + '()': 'colorlog.ColoredFormatter', + 'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s' + }, + 'basic': { + '()': 'logging.Formatter', + 'format': '%(levelname)s:%(name)s:%(asctime)s:%(message)s' + } + }, + 'handlers': { + 'stderr': { + 'class': 'logging.StreamHandler', + # The handler level will override the logger level if higher. 
+ # That is, if the logger level is set to pass through DEBUG + # and higher and the handler is set to only pass through WARNING + # and higher, DEBUG messages will not pass through to this loggers + # handler. You can configure multiple handlers for any logger so + # for example you could log WARNINGS and ERRORS to a file but + # not save all the DEBUG messages. + 'level': logging.DEBUG, + 'formatter': 'colored' + }, + 'file': { + 'class': 'logging.handlers.RotatingFileHandler', + 'level': logging.INFO, + 'formatter': 'basic', + 'filename': LOG_FILENAME, + 'maxBytes': 32768, + 'backupCount': 3 + } + }, + 'loggers': { + # Root Logger + '': { + 'level': logging.DEBUG, + 'handlers': ['file'], + }, + 'main': { + 'level': logging.DEBUG, + 'handlers': ['stderr'], + 'propagate': False + }, + 'model': { + 'level': logging.DEBUG, + 'handlers': ['stderr'], + 'propagate': True + }, + 'cli': { + 'level': logging.DEBUG, + 'handlers': ['stderr'], + 'propagate': False + }, + 'pages': { + 'level': logging.INFO, + 'handlers': ['stderr'], + 'propagate': False + } + } + } + + diff --git a/setup.py b/setup.py index e69de29..789c1f2 100644 --- a/setup.py +++ b/setup.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# setup.py is the install script for this application. This will download +# required third-party dependencies and package the app. You can also +# install the application system-wide. 
+ +from setuptools import setup + +__project__ = 'wikicrawl' +# If you're looking for a versioning scheme, one revered pattern +# can be read about at http://semver.org +__version__ = '0.9.0' + +setup(name = __project__, + version = __version__, + description = '', + author = '', + author_email = '', + url = '', + install_requires = ('yandex.translate', + 'selenium', + ), + packages = ('wikicrawl',)) + diff --git a/wikicrawl/.dal.py.swp b/wikicrawl/.dal.py.swp new file mode 100644 index 0000000..d9605c8 Binary files /dev/null and b/wikicrawl/.dal.py.swp differ diff --git a/wikicrawl/__init__.py b/wikicrawl/__init__.py new file mode 100644 index 0000000..55caacd --- /dev/null +++ b/wikicrawl/__init__.py @@ -0,0 +1,12 @@ +# The __init__.py file signals to the python interpreter that the +# app directory is a package. A package is a special module that +# contains other modules. Each file is a module (browser, cli, etc.) +# and the "wikicrawl" package is a module that contains other modules. + +# The wikicrawl package, which is a module, exports the stuff exposed here. 
+# We export wikicrawl.init as a reference to wikicrawl.config.init and +# wikicrawl.main as a reference to wikicrawl.main.main + +from .config import init +from .main import main + diff --git a/wikicrawl/assets/languages.py b/wikicrawl/assets/languages.py new file mode 100644 index 0000000..7d56a0b --- /dev/null +++ b/wikicrawl/assets/languages.py @@ -0,0 +1,39 @@ + +LANGUAGES = { + 'az': '', + 'be': '', + 'bg': '', + 'ca': '', + 'cs': '', + 'da': '', + 'de': 'Deutsch', + 'el': '', + 'en': 'English', + 'es': 'Español', + 'et': '', + 'fi': '', + 'fr': 'Français', + 'hr': '', + 'hu': '', + 'hy': '', + 'it': 'Italiano', + # 'ja': '日本語', -- no japanese in yandex + 'lt': '', + 'lv': '', + 'mk': '', + 'nl': '', + 'no': '', + 'pl': 'Polski', + 'pt': 'Português', + 'ro': '', + 'ru': 'Русский', + 'sk': '', + 'sl': '', + 'sq': '', + 'sr': '', + 'sv': '', + 'tr': '', + 'uk': '', + # 'zh': '中文' -- no chinese +} + diff --git a/app/browser.py b/wikicrawl/browser.py similarity index 73% rename from app/browser.py rename to wikicrawl/browser.py index 59b4cc0..5fc9357 100644 --- a/app/browser.py +++ b/wikicrawl/browser.py @@ -11,9 +11,9 @@ import selenium import selenium.webdriver -import logging from . import config +from . import log # This function has a parameter (driver) that passes in a value. In this case, # this driver variable defaults to the string 'chrome'. 
The code can call @@ -25,9 +25,17 @@ def create_webdriver(driver='chrome'): return create_webdriver_chrome() elif driver == 'firefox': return create_webdriver_firefox() + elif driver == 'phantom': + return create_webdriver_phantom() + else: + log.LOGGER['browser'].error('unable to handle webdriver request: %s' % driver) + return def create_webdriver_firefox(): - pass + profile = selenium.webdriver.FirefoxProfile() + profile.set_preference("general.useragent.override", config.obj.WEBDRIVER_USER_AGENT) + driver = selenium.webdriver.Firefox(profile) + return driver def create_webdriver_chrome(): opt = selenium.webdriver.chrome.options.Options() @@ -35,3 +43,7 @@ def create_webdriver_chrome(): driver = selenium.webdriver.Chrome(chrome_options = opt) return driver +def create_webdriver_phantom(): + driver = selenium.webdriver.PhantomJS() + return driver + diff --git a/app/cli.py b/wikicrawl/cli.py similarity index 74% rename from app/cli.py rename to wikicrawl/cli.py index 67b8ff9..b2890ef 100644 --- a/app/cli.py +++ b/wikicrawl/cli.py @@ -1,37 +1,42 @@ -#!/usr/bin/env python -# The command-line interface module creates a interface for +# The command-line interface module creates an interface for # interacting with the python program (wikicrawl). This is an implementation # of the baker demo shown previously. The user can type in commands to # make the program do things. import baker -import logging -import readline # Needed for command history and arrows to work import sys + +if sys.platform == 'linux': + import readline # Needed for command history and arrows to work + +from . import log from . import model from . 
import config # Problem pages: # Decision (from politics) # Malaysia (goes inside parenthesis) +# Soft-sediment_deformation_structures (doesn't find link) +# Chemicals (loops at philosophical) commander = baker.Baker() -def main(): - user_interface = InteractiveInterface() - - if len(sys.argv) > 1: # Command line arguments were passed in - # command-line when invoking python - user_interface.run(sys.argv) - else: - user_interface.start_command_loop() - class InteractiveInterface: def __init__(self): + # Instantiate the variable self.model as an object + # of instance of the Model class defined in the model + # module. model.Model refers to the Model class in the + # model module and this line creates a new variable (self.model) + # which is a variable that is an instance of Model, i.e. + # it has the type Model and has Model.methods() available + # to it. + # + # self.model is a variable that is attached to the instance/object + # returned by this constructor that has the type InteractiveInterface. self.model = model.Model() - def run(self, args, main=True): + def run_command(self, args, main=True): """ Runs the command-line interface for a single command. @@ -45,13 +50,13 @@ class InteractiveInterface: commander.run(argv=args, main=True, help_on_error=True, instance=self) except baker.CommandError as ex: - logging.warn('incorrect user input: %s' % ex) + log.LOGGER['cli'].warning('incorrect user input: %s' % ex) commander.usage() except baker.TopHelp as ex: commander.usage() except Exception as ex: - logging.error('caught general exception!!') - print(type(ex), ex) + log.LOGGER['cli'].error('caught general exception!!') + log.LOGGER['cli'].error('%s: %s', type(ex), ex) def start_command_loop(self): """ @@ -76,12 +81,13 @@ class InteractiveInterface: # to NOT drop to a newline after printing # in the terminal. Instead, let the user # type their command on the same line as - # our printed '$ '. + # the printed '$ '. 
try: inp = input() except EOFError: # +D will send "End Line" and exit the command loop break - # Note in arguments (mg): + + # Note on "arguments" (mg): # Whenever a program is run in windows or *nix, the operating # system passes in the command string that was used to invoke # the program. You can append data in that command to configure @@ -91,16 +97,16 @@ class InteractiveInterface: # software but you can also pass in an argument. You can # alternatively run "python launcher.py ..." # and the operating system will provide the values into - # the process that is running. + # the process that is running as variables. # # In a real world use case, many commands provide switches to # adjust what the program does. For example, # # The command: - # find music -iname "*justin*bieber*" + # find music -name "*justin*bieber*" # runs the "find" program and asks to find all the filenames that match the # pattern *justin*bieber* in the "music" directory. - # (music, -iname, "*justin*biever*") are argument parameters + # (music, -name, "*justin*biever*") are argument parameters # that are passed into the program. The program is coded to # parse and interpret these values and execute differently based # on the values passed in. This is one way to pass in information @@ -123,14 +129,21 @@ class InteractiveInterface: # would be C:\Users\mguest\launcher.py. # What this method (start_command_loop()) does is provide a - # REPL which is a + # REPL shell which is a # read-eval-print-loop. It repeatedly asks the user for an # input (read), evaluates that input into an action (evaluate), # give the user some feedback (print), and start the process - # over again (loop). When you call "python", you are given a python - # process that gives you a REPL interactive shell. The way + # over again (loop). When you call just "python", you are loading a + # program that gives you a REPL interactive shell. 
The way # this wikicrawl app is implemented gives the user a REPL # that has commands to interact with wikipedia pages. + + # Because we take in the input as a single string, we do + # a transformation to turn something like "play_multiple 5" + # into ["launcher.py", "play_multiple", "5"] which is how + # the arguments array would have been created if it were + # passed in the initial command instead of typed and interpreted + # as input as is done here. args = [sys.argv[0], ] + inp.split() # The user can at any point in the command pass the argument @@ -146,40 +159,42 @@ class InteractiveInterface: # python launcher.py do_random_page --help # You will see the program spit out the heredoc below the # do_random_page method defined below. - if '--help' in args: args.remove('--help') try: print('command usage:') commander.usage(args[1]) - return except Exception as ex: print(type(ex), ex) continue - self.run(args, main=False) + self.run_command(args, main=False) @commander.command - def do_random_page(self): + def play_random_page(self): """ Instructs the wikicrawl application to play the game on a random article. """ - self.model.do_random_page() + self.model.play_random_page() @commander.command - def do_n_pages(self, n): + def play_multiple(self, n): """ Plays the wikicrawl game -times. """ try: n = int(n) except ValueError as ex: - logging.warn('failed to process "%s" as a parameter' % n) + log.LOGGER['cli'].warning('failed to process "%s" as a parameter' % n) return False for i in range(n): - self.model.do_random_page() + self.model.play_random_page() -if __name__ == '__main__': - main() + @commander.command + def exit(self): + """ + Immediately exit the program. 
+ """ + sys.exit(0) diff --git a/app/config.py b/wikicrawl/config.py similarity index 100% rename from app/config.py rename to wikicrawl/config.py diff --git a/app/dal.py b/wikicrawl/dal.py similarity index 100% rename from app/dal.py rename to wikicrawl/dal.py diff --git a/wikicrawl/log.py b/wikicrawl/log.py new file mode 100644 index 0000000..a1dbc37 --- /dev/null +++ b/wikicrawl/log.py @@ -0,0 +1,33 @@ +# log module is a wrapper around third-party colorlog library +# and provides an application-level interface to a logging system. + +import colorlog +import logging.config # dictConfig lives in the logging.config submodule; this import also binds the name "logging" + +from . import config + +# Default python log severity levels: +# CRITICAL +# ERROR +# WARNING +# INFO +# DEBUG + +LOGGER = None + +class LoggingLayer: + def __init__(self, config): + self.loggers = {} + logging.config.dictConfig(config) + + def __getitem__(self, k): + logger = self.loggers.get(k) + if not logger: + logger = logging.getLogger(k) + self.loggers[k] = logger + return logger + +def init_logging(): + global LOGGER + LOGGER = LoggingLayer(config.obj.LOG_SETTINGS) + diff --git a/wikicrawl/main.py b/wikicrawl/main.py new file mode 100644 index 0000000..a7a39f0 --- /dev/null +++ b/wikicrawl/main.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +import sys + +from . import cli +from . import util + +def main(): + user_interface = cli.InteractiveInterface() + + if len(sys.argv) > 1: # Command line arguments were passed in + # command-line when invoking python + user_interface.run_command(sys.argv) + else: + user_interface.start_command_loop() + +if __name__ == '__main__': + main() + diff --git a/app/model.py b/wikicrawl/model.py similarity index 70% rename from app/model.py rename to wikicrawl/model.py index 89810d1..5be3f54 100644 --- a/app/model.py +++ b/wikicrawl/model.py @@ -6,7 +6,6 @@ # to implement the wiki crawl. This is a separation of concerns # and keeps the logic organized and separated. -import logging import os import time @@ -15,10 +14,12 @@ from . import config from . 
import dal from . import log from . import pages +from . import util class Model: def __init__(self): self._webdriver = None + self._translated_philosophy = None @property def webdriver(self): @@ -37,6 +38,16 @@ class Model: page_api.goto_landing_page() return self._webdriver + @property + def translated_philosophy(self): + # This translates 'philosophy' to the target language with only 1 api call. + if config.obj.PAGE_LANGUAGE == 'en': + self._translated_philosophy = 'philosophy' + elif not self._translated_philosophy: + text = util.translate_text('en', config.obj.PAGE_LANGUAGE, 'philosophy') + self._translated_philosophy = text + return self._translated_philosophy + def open_browser(self): x = self.webdriver # Request the browser open immediately. # Without this, the Model object will @@ -48,7 +59,7 @@ class Model: # creates it and then it is re-used later # in the application. - def do_random_page(self): + def play_random_page(self): """ Select a random page and repeatedly click the first link until we reach the article on philosophy. Sometimes, the driver encounters @@ -93,25 +104,50 @@ class Model: # the action we're trying to invoke. page_api.goto_random_article() - # Article page + # Article pages pages_visited = [] + + # We just need translated_title to exist + translated_title = None while True: page_api = pages.ArticlePage(self.webdriver) + # Get the article title (and translate if necessary) title = page_api.get_title() - logging.debug('visited page: %s' % title) + if config.obj.PAGE_LANGUAGE != 'en': + translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title) + log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title)) + else: + log.LOGGER['model'].info('visited page: %s' % title) + + # Check for page loops (have we already visisted this page?) 
if title in pages_visited: - logging.info('encountered loop at page = %s' % title) + log.LOGGER['model'].info('encountered loop at page = %s' % title) break - if title == 'Philosophy': - logging.info('made it to philosophy in %s pages' % len(pages_visited)) + + # Check if we reached the article on philosophy + if self._is_article_on_philosophy(title, translated_title): + log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited)) pages_visited.append(title) break + + # Store the result of what articles have been navigated pages_visited.append(title) rc = page_api.click_first_link() if not rc: - logging.warn('failure: unable to continue (perhaps no valid links?)') + log.LOGGER['model'].warning('failure: unable to continue (perhaps no valid links?)') break print() + def _is_article_on_philosophy(self, title, translated_title): + """ + Checks both the original title and the translated (to english) title to + see if they seem to be the page on philosophy. + """ + if title.casefold() == self.translated_philosophy.casefold(): + return True + if translated_title and translated_title.casefold() == 'philosophy': + return True + return False + diff --git a/app/pages.py b/wikicrawl/pages.py similarity index 72% rename from app/pages.py rename to wikicrawl/pages.py index 658d352..bae4b23 100644 --- a/app/pages.py +++ b/wikicrawl/pages.py @@ -1,22 +1,16 @@ -# Pages module defines classes for interacting with wikipedia pages. +# pages module defines classes for interacting with wikipedia pages. # There are separate classes defined for each page with their own # defined methods for performing certain actions. -import logging import re import selenium import time from . import browser from . import config - -def breakpoint(): - """ - If DO_BREAKPOINTS is switched on, this will pause program - execution and wait for the user to press enter to continue. - """ - if config.obj.DO_BREAKPOINTS: - input('Breakpoint here. to continue...') +from . import log +from . 
import util +from .assets.languages import LANGUAGES class PageRootObject: """ @@ -28,7 +22,7 @@ class PageRootObject: In here are some re-used methods to click links and highlight elements in the browser. """ - def __init__(self, driver=None): + def __init__(self, driver): """ Object constructor for initializing the instance of this class with internal variables needed. @@ -37,10 +31,7 @@ class PageRootObject: driver: Reference to the selenium webdriver object that is used to interface with the web browser. """ - if not driver: - self.driver = browser.create_webdriver() - else: - self.driver = driver + self.driver = driver def click(self, el): """ @@ -49,11 +40,11 @@ class PageRootObject: Args: el: selenium element to be clicked. Typically an anchor - html link in the page. + html link in the webpage. """ self.highlight(el, 'red') time.sleep(config.obj.PAGE_DELAY) - breakpoint() + util.breakpoint() el.click() def highlight(self, el, color): @@ -68,36 +59,55 @@ class PageRootObject: color: background color to highlight. Input can be one of 'red', 'blue', or hex code such as '#ffffff'. """ - # Note: The way hex codes work is there are 1 byte (2 hex characters) + # Note: The way hex codes work is there is 1 byte (2 hex characters) # for every color. #RRGGBB for (red, green, blue). This can be thought # of as an integer 0-255 for red, green, and blue in base-16 hexadecimal. + # For example, #ff0000 is bright red while #002f00 is light green + # and #ffff00 is full yellow. 
if color == 'red': - js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292' + color = '#ff9292' + # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292' elif color == 'blue': - js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff' + color = '#9292ff' + # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff' else: + # color = color js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color + js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color self.driver.execute_script(js, el) +# Note: This is the syntax for class inheritance. LandingPage is a new type of object that inherits +# everything from the PageRootObject type. With this, you can call LandingPage.highlight() which +# is a method defined in PageRootObject. class LandingPage(PageRootObject): """ Interface for working with the wikipedia.org landing page. This page has links to select a language and go to the respective wikipedia root page. """ - # Note: This is the LandingPage() object constructor. All it does right now is + # Note: This is the LandingPage() class constructor. The constructor is a method + # that is executed when a new object of this class is created. All it does right now is # reference the parent (PageRootObject) constructor method and call it. This - # calls PageRootObject.__init__(driver) which makes the web driver available + # calls PageRootObject.__init__(driver) which then makes the web driver available # in the object instance. 
def __init__(self, driver=None): super().__init__(driver) def goto_landing_page(self): + """ + Navigates the browser to www.wikipedia.org + """ self.driver.get(config.obj.PAGE_BASE_URL) def select_language(self, language): - link = self.driver.find_element_by_partial_link_text(language) - self.click(link) + lang_text = LANGUAGES.get(language) or language # fall back to the raw value when the code is unknown or has no display name yet + try: + link = self.driver.find_element_by_partial_link_text(lang_text) + self.click(link) + return True + except selenium.common.exceptions.NoSuchElementException as ex: + log.LOGGER['pages'].warning('failed to find language: %s as %s' % (language, lang_text)) + return False class MainPage(PageRootObject): """ @@ -121,7 +131,7 @@ class ArticlePage(PageRootObject): # These are used to locate html elements in the web browser. There are many # ways to locate elements but one of the best if available is locating by id. It's # not enforced but the html specification mandates that element id's are unique - # so if you can select by id in a semanticly correct web page, you can correctly + # so if you can select by id in a semantically correct web page, you can correctly # select unique elements with high confidence. elements = { 'main-window-content-text-id': 'mw-content-text', @@ -132,10 +142,18 @@ class ArticlePage(PageRootObject): super().__init__(driver) def get_title(self): + """ + Returns the article title. + """ heading = self.driver.find_element_by_id(ArticlePage.elements['article-title']) return heading.text def click_first_link(self): + """ + Attempts to click the first valid link in the article. Some work is + done to skip over certain links but the implementation breaks in some + edge cases. It's close but not perfect for every article text. + """ return self._iterate_paragraphs() # Note: Here this method has it's name prepended with a single underscore. 
@@ -159,33 +177,51 @@ class ArticlePage(PageRootObject): main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id']) paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]') for p in paragraphs: + # Return code indicates the success status of _parse_paragraph(). + # In this case, an rc of True means that it was able to find a + # link and we stop going through paragraphs. rc = self._parse_paragraph(p) if rc: return True def _parse_paragraph(self, p): + """ + Attempts to find a valid link in the paragraph element sent in. + + Args: + p: Reference to selenium paragraph element. This is a paragraph + taken from the article. + """ links = p.find_elements_by_xpath('.//a') if len(links) == 0: return False for link in links: - logging.debug('processing link: %s' % link.text) + log.LOGGER['pages'].debug('processing link: %s' % link.text) if not self._is_valid_link(p, link): - logging.debug('skipping link inside parenthesis: %s' % link.text) + log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text) self.highlight(link, 'blue') continue self.highlight(link, 'red') - logging.info('selected link: %s' % link.text) + log.LOGGER['pages'].info('selected link: %s' % link.text) self.click(link) return True def _is_valid_link(self, p, el): - a = self._is_link_in_parenthesis(p, el) - b = self._is_link_a_footnote(el) - c = self._is_link_pronounciation(el) - d = self._is_link_audio(el) - if not a and not b and not c and not d: - return True - return False + """ + Returns if the implementation decides to skip this link. You can + see the reasons we invalidate and skip a link here. If it's + inside parenthesis, is a footnote, is a pronounciation guide or + audio link, we choose to skip it. 
+ """ + if self._is_link_in_parenthesis(p, el): + return False + if self._is_link_a_footnote(el): + return False + if self._is_link_pronounciation(el): + return False + if self._is_link_audio(el): + return False + return True def _is_link_in_parenthesis(self, p, el): """ @@ -198,7 +234,7 @@ class ArticlePage(PageRootObject): # certain links and usually avoid links inside parenthetical # notes. Some edge cases are nested parenthesis, links with # non-english characters (which are displayed with a tree - # of elements in the html rather than a simply link). And + # of elements in the html rather than a simple link). And # sometimes, the link inside the parenthesis may be a valid # target. I've made it so that skipped links show up as blue # and determined-valid links highlight as red. diff --git a/wikicrawl/util.py b/wikicrawl/util.py new file mode 100644 index 0000000..4a87ff6 --- /dev/null +++ b/wikicrawl/util.py @@ -0,0 +1,23 @@ +# util module contains utility functions that can be common or shared +# between the other modules. + +import yandex_translate + +from . import config + +def breakpoint(): + """ + If DO_BREAKPOINTS is switched on, this will pause program + execution and wait for the user to press enter to continue. + """ + if config.obj.DO_BREAKPOINTS: + input('BREAKPOINT hit. to continue...') + +def translate_text(source_language, target_language, text): + translate = yandex_translate.YandexTranslate(config.obj.YANDEX_API_KEY) + if not source_language: + source_language = translate.detect(text) + lang_direction = '%s-%s' % (source_language, target_language) + result = translate.translate(text, lang_direction) + return result['text'][0] +