wikicrawl/app/model.py

118 lines
5.1 KiB
Python
Raw Normal View History

# The model module contains the business logic of the program. Note that
# the command-line interface contains no business logic; it only
# calls into the model. Similarly, the page objects define no
# functionality specific to the wiki crawl; they only provide general
# utility methods that the model calls to implement the crawl. This
# separation of concerns keeps the logic organized.
2017-08-17 01:27:05 -06:00
import logging
import os
import time
from . import browser
from . import config
2017-08-17 01:27:05 -06:00
from . import dal
from . import log
2017-08-17 01:27:05 -06:00
from . import pages
class Model:
    """Business logic for the wiki crawl.

    The model owns a lazily created webdriver (see ``webdriver``) and drives
    the page objects from the ``pages`` module to perform the crawl.
    """

    def __init__(self):
        # No browser yet; it is created on first access to ``webdriver``.
        self._webdriver = None

    @property
    def webdriver(self):
        """Return the webdriver, creating it on first access.

        On the first access a driver is requested from
        ``browser.create_webdriver`` using ``config.obj.WEBDRIVER_BROWSER``
        and the browser is sent to the landing page. Later accesses return
        the same object, so callers can rely on ``self.webdriver`` always
        being available without knowing whether it has been created yet.
        """
        if self._webdriver is None:
            driver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
            # Store the driver before navigating so it is kept even if the
            # navigation below raises.
            self._webdriver = driver
            # Use the local ``driver`` rather than ``self.webdriver`` so the
            # property never re-enters itself while it is still initializing.
            pages.LandingPage(driver).goto_landing_page()
        return self._webdriver

    def open_browser(self):
        """Open the browser immediately instead of on first use.

        Without this call the browser would not open until the first
        command touches ``self.webdriver``; the driver created here is
        re-used for the rest of the application.
        """
        _ = self.webdriver  # Accessed only for its create-on-demand effect.

    def do_random_page(self):
        """
        Select a random page and repeatedly click the first link until
        we reach the article on philosophy. Sometimes, the driver encounters
        a loop and will never reach the page and sometimes the parser
        fails and we fail to programmatically implement what we're trying to
        do correctly.
        """
        # Landing page: navigate there and select the language taken from
        # the runtime configuration (so the crawl can be run for English,
        # Spanish, Russian, ...).
        page_api = pages.LandingPage(self.webdriver)
        page_api.goto_landing_page()
        page_api.select_language(config.obj.PAGE_LANGUAGE)

        # Main page: selecting the language changed the page, so re-create
        # the page interface as the class that matches the page we are on.
        page_api = pages.MainPage(self.webdriver)
        page_api.goto_random_article()

        # Article pages: follow first links, remembering every title seen so
        # that a revisit (a loop) can be detected.
        pages_visited = []
        while True:
            page_api = pages.ArticlePage(self.webdriver)
            title = page_api.get_title()
            logging.debug('visited page: %s', title)

            if title in pages_visited:
                logging.info('encountered loop at page = %s', title)
                break

            if title == 'Philosophy':
                # The count is logged before the target page itself is
                # recorded, so it excludes the Philosophy page.
                logging.info('made it to philosophy in %s pages',
                             len(pages_visited))
                pages_visited.append(title)
                break

            pages_visited.append(title)
            rc = page_api.click_first_link()
            if not rc:
                logging.warning('failure: unable to continue (perhaps no valid links?)')
                break

        # NOTE(review): indentation was lost in the reviewed copy of this
        # file; the blank-line print is assumed to run once after the loop
        # ends - confirm against the repository.
        print()