118 lines
5.1 KiB
Python
118 lines
5.1 KiB
Python
# The model module contains the business logic of the program. Notice
# that the command-line interface contains no business logic and only
# has functionality to call on the model. Similarly, the page
# objects define no specific functionality for doing the wiki crawl
# but only provide general utility methods that are called upon
# to implement the wiki crawl. This is a separation of concerns
# and keeps the logic organized and separated.
|
|
|
|
import logging
|
|
import os
|
|
import time
|
|
|
|
from . import browser
|
|
from . import config
|
|
from . import dal
|
|
from . import log
|
|
from . import pages
|
|
|
|
class Model:
    """Business logic of the wiki crawl.

    The model owns a single webdriver, created lazily on first use, and
    drives the page objects from the ``pages`` module to perform the
    crawl. The page objects only provide general utility methods; the
    sequencing of those calls lives here.
    """

    def __init__(self):
        # No browser is opened at construction time; it is created on
        # first access of the ``webdriver`` property.
        self._webdriver = None

    @property
    def webdriver(self):
        """Return the shared webdriver, creating it on first access.

        When an instance is accessed as ``x.webdriver`` this method runs.
        If the private ``self._webdriver`` has not been set yet, a new
        driver is requested from ``browser.create_webdriver`` for the
        configured browser and is immediately pointed at the landing
        page. Every later access re-uses that same driver, so callers can
        rely on ``self.webdriver`` always existing without knowing
        whether it has been created yet.
        """
        if self._webdriver is None:
            self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
            # Use the private attribute directly rather than re-entering
            # this property while it is still initialising.
            page_api = pages.LandingPage(self._webdriver)
            page_api.goto_landing_page()
        return self._webdriver

    def open_browser(self):
        """Request the browser to open immediately.

        Without this call the webdriver is only created on demand, which
        means the web browser would not open until the first command is
        typed in. Touching the property here creates it up front, and it
        is then re-used later in the application.
        """
        # Only the side effect of the property access matters; the value
        # itself is not needed here.
        _ = self.webdriver

    def do_random_page(self):
        """Follow first links from a random article until 'Philosophy'.

        Select a random page and repeatedly click the first link until
        we reach the article on philosophy. Sometimes the driver
        encounters a loop and will never reach the page, and sometimes
        no further link can be clicked; both cases are logged and end
        the crawl.
        """
        driver = self.webdriver

        # Landing page: go there and select the language taken from the
        # runtime configuration (the language is a program parameter, so
        # the crawl can be run for English, Spanish, Russian, ...).
        page_api = pages.LandingPage(driver)
        page_api.goto_landing_page()
        page_api.select_language(config.obj.PAGE_LANGUAGE)

        # Main page: we have clicked a link and changed the page, so the
        # page interface is re-created as an object of a different class
        # that contains the code for working with this page.
        page_api = pages.MainPage(driver)
        page_api.goto_random_article()

        # Article pages: remember every title seen so that a revisit can
        # be recognised as a loop.
        pages_visited = []
        while True:
            page_api = pages.ArticlePage(driver)

            title = page_api.get_title()
            logging.debug('visited page: %s', title)
            if title in pages_visited:
                logging.info('encountered loop at page = %s', title)
                break
            if title == 'Philosophy':
                # The count is taken before the final page is recorded,
                # i.e. it is the number of pages visited on the way.
                logging.info('made it to philosophy in %s pages', len(pages_visited))
                pages_visited.append(title)
                break
            pages_visited.append(title)

            # A falsy result from click_first_link() ends the crawl.
            if not page_api.click_first_link():
                logging.warning('failure: unable to continue (perhaps no valid links?)')
                break

        # NOTE(review): the source's indentation was lost; this blank-line
        # print is assumed to run once after the crawl loop -- confirm.
        print()
|
|
|