wikicrawl/app/model.py

# The model module contains the business logic of the program. Note that
# the command-line interface contains no business logic; it only calls
# into the model. Similarly, the page objects define no crawl-specific
# functionality; they only provide general utility methods that the
# model calls to implement the wiki crawl. This separation of concerns
# keeps the logic organized.

import logging
import os
import time

from . import browser
from . import config
from . import dal
from . import log
from . import pages

class Model:
    """Business logic of the wiki crawl.

    The model owns a single, lazily created webdriver and drives it
    through the page objects in the ``pages`` module. The page objects
    only offer generic helpers (go to a page, read a title, click a
    link); the crawl itself is implemented here.
    """

    # Title of the article that ends a crawl successfully.
    # NOTE(review): this is the English title; runs with a different
    # config.obj.PAGE_LANGUAGE presumably need a localized title -- confirm.
    TARGET_TITLE = 'Philosophy'

    def __init__(self):
        # Holds the webdriver once it exists; populated on first access
        # of the ``webdriver`` property.
        self._webdriver = None

    @property
    def webdriver(self):
        """Return the shared webdriver, creating it on first access.

        Reading ``model.webdriver`` runs this method. If no webdriver has
        been created yet, one is requested from
        ``browser.create_webdriver`` for the configured browser and is
        pointed at the landing page; afterwards the same object is
        returned on every access. Callers can therefore always rely on
        ``self.webdriver`` without knowing whether the browser has
        already been opened.
        """
        if self._webdriver is None:
            self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
            # Use the private attribute here rather than re-entering this
            # property while it is still initializing.
            pages.LandingPage(self._webdriver).goto_landing_page()
        return self._webdriver

    def open_browser(self):
        """Open the browser immediately instead of on first use.

        Without this call the webdriver is only created when the first
        command needs it. Touching the property here forces creation now,
        and the same webdriver is then re-used by later commands.
        """
        _ = self.webdriver

    def do_random_page(self):
        """
        Select a random page and repeatedly click the first link until
        we reach the target article (``TARGET_TITLE``, the article on
        philosophy). Sometimes the crawl encounters a loop and will never
        reach the target, and sometimes no usable first link is found;
        in both cases the crawl stops and the reason is logged.
        """
        # Landing page: open it and select the configured language, so
        # the crawl can be run for English, Spanish, Russian, etc.
        page_api = pages.LandingPage(self.webdriver)
        page_api.goto_landing_page()
        page_api.select_language(config.obj.PAGE_LANGUAGE)

        # Main page: selecting the language changed the page, so we
        # re-create the page interface as a different class with the
        # helpers for the new page, then jump to a random article.
        page_api = pages.MainPage(self.webdriver)
        page_api.goto_random_article()

        # Article pages: follow first links until we hit the target,
        # revisit a page (a loop), or cannot continue.
        pages_visited = []
        while True:
            page_api = pages.ArticlePage(self.webdriver)

            title = page_api.get_title()
            logging.debug('visited page: %s', title)
            if title in pages_visited:
                logging.info('encountered loop at page = %s', title)
                break
            if title == self.TARGET_TITLE:
                # The count is taken before the target itself is recorded.
                logging.info('made it to philosophy in %s pages', len(pages_visited))
                pages_visited.append(title)
                break
            pages_visited.append(title)

            if not page_api.click_first_link():
                # logging.warn is a deprecated alias that was removed in
                # Python 3.13; logging.warning is the supported spelling.
                logging.warning('failure: unable to continue (perhaps no valid links?)')
                break
            print()
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`# model module contains the business logic of the program. Notice`
			`# the command-line interface contains no business logic and only`
			`# has functionality to call on the model. Similarly, the page`
			`# objects define no specific functionality for doing the wiki crawl`
			`# but only provide general utility methods that are called upon`
			`# to implement the wiki crawl. This is a separation of concerns`
			`# and keeps the logic organized and separated.`

first working version 2017-08-17 01:27:05 -06:00			`import logging`
			`import os`
			`import time`

			`from . import browser`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`from . import config`
first working version 2017-08-17 01:27:05 -06:00			`from . import dal`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`from . import log`
first working version 2017-08-17 01:27:05 -06:00			`from . import pages`

			`class Model:`
			`def __init__(self):`
			`self._webdriver = None`

			`@property`
			`def webdriver(self):`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`# The way this works is when an object instance of class/type Model`
			`# is called with x.webdriver, Model runs webdriver(). In our case,`
			`# the webdriver() method checks if a private variable self._webdriver`
			`# exists and if it isn't, asks for a new selenim object. The result`
			`# is that this will on-demand create a browser. If one exists, it will`
			`# use the one that exists and if one doesn't exists, it will create`
			`# one and use that. External code can rely on self.webdriver`
			`# always existing with or without knowing if it exists because if it`
			`# hasn't been created yet then it will be created on-the-fly.`
first working version 2017-08-17 01:27:05 -06:00			`if not self._webdriver:`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)`
cli and multi language support 2017-08-17 01:45:07 -06:00			`page_api = pages.LandingPage(self.webdriver)`
			`page_api.goto_landing_page()`
first working version 2017-08-17 01:27:05 -06:00			`return self._webdriver`

lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`def open_browser(self):`
			`x = self.webdriver # Request the browser open immediately.`
			`# Without this, the Model object will`
			`# be created on-demand (as defined in`
			`# the Model class). This means that the`
			`# web browser will not open until`
			`# a command is typed in. But because we`
			`# request the webdriver right here, Model`
			`# creates it and then it is re-used later`
			`# in the application.`

first working version 2017-08-17 01:27:05 -06:00			`def do_random_page(self):`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`"""`
			`Select a random page and repeatedly click the first link until`
			`we reach the article on philosophy. Sometimes, the driver encounters`
			`a loop and will never reach the page and sometimes the parser`
			`fails and we fail to programmatically implement what we're trying to`
			`do correctly.`
			`"""`
			`# The following 3 lines include the functionality`
			`# for the Landing page (select language).`

			`# This line creates a new object (page_api) which is an instance`
			`# of type class pages.LandingPage. LandingPage is a variable`
			`# containing a class definition that is located in the pages module.`
			`# We pass self.webdriver as arguments into the LandingPage.__init__`
			`# constructor.`
first working version 2017-08-17 01:27:05 -06:00			`page_api = pages.LandingPage(self.webdriver)`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00
			`# This line calls the page_api object's (an instance of`
			`# pages.LandingPage type) method goto_landing_page.`
first working version 2017-08-17 01:27:05 -06:00			`page_api.goto_landing_page()`

lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00			`# Similarly, this line calls the select_language method`
			`# and passes in values from our runtime configuration.`
			`# In this case, we have made the language a parameter`
			`# that you can pass into the program, i.e. you can run it`
			`# for English or Spanish or Russian or what have you.`
			`page_api.select_language(config.obj.PAGE_LANGUAGE)`

			`# Main page: next 2 lines`

			`# At this point, we have clicked a link and changes the page. We`
			`# re-create our page interface as a new object which is of`
			`# a different class and includes distinct code for working with`
			`# the page. In this case, we delete page_api and re-create it`
			`# as an object of type pages.MainPage. Again, we pass in`
			`# self.webdriver as an object of the selenium webdriver interface.`
			`# The page_api calls methods on this webdriver to make the web`
			`# browser do various things like click links or extract text.`
first working version 2017-08-17 01:27:05 -06:00			`page_api = pages.MainPage(self.webdriver)`
lots of documentation and safer functionality 2017-08-22 17:50:33 -06:00
			`# We call pages.MainPage.goto_random_article() to perform`
			`# the action we're trying to invoke.`
first working version 2017-08-17 01:27:05 -06:00			`page_api.goto_random_article()`

			`# Article page`
			`pages_visited = []`
			`while True:`
			`page_api = pages.ArticlePage(self.webdriver)`

			`title = page_api.get_title()`
			`logging.debug('visited page: %s' % title)`
			`if title in pages_visited:`
			`logging.info('encountered loop at page = %s' % title)`
			`break`
			`if title == 'Philosophy':`
			`logging.info('made it to philosophy in %s pages' % len(pages_visited))`
			`pages_visited.append(title)`
			`break`
			`pages_visited.append(title)`

			`rc = page_api.click_first_link()`
			`if not rc:`
			`logging.warn('failure: unable to continue (perhaps no valid links?)')`
			`break`
			`print()`