# The model module contains the business logic of the program. Notice that
# the command-line interface contains no business logic and only
# has functionality to call on the model. Similarly, the page
# objects define no specific functionality for doing the wiki crawl
# but only provide general utility methods that are called upon
# to implement the wiki crawl. This is a separation of concerns
# and keeps the logic organized and separated.

import logging
import os
import time

from . import browser
from . import config
from . import dal
from . import log
from . import pages


class Model:
    """Business logic for the wiki crawl, driven through a lazily created webdriver."""

    def __init__(self) -> None:
        # No browser is opened at construction time; see the ``webdriver``
        # property, which creates it on first access.
        self._webdriver = None

    @property
    def webdriver(self):
        """Return the shared webdriver, creating it on first access.

        Reading ``x.webdriver`` on a Model instance runs this method. It
        checks whether the private attribute ``self._webdriver`` has been
        set and, if it has not, asks the ``browser`` module for a new
        driver (selected by ``config.obj.WEBDRIVER_BROWSER``) and sends it
        to the landing page. Later reads re-use the same driver, so
        callers can rely on ``self.webdriver`` always being available
        without knowing whether the browser has been opened yet.
        """
        if self._webdriver is None:
            driver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
            # Store the driver before navigating so the attribute is set
            # even while the landing page is being loaded.
            self._webdriver = driver
            landing = pages.LandingPage(driver)
            landing.goto_landing_page()
        return self._webdriver

    def open_browser(self) -> None:
        """Open the web browser immediately.

        Without this call the webdriver is created on demand (see the
        ``webdriver`` property), which means the browser would not open
        until the first command is typed in. Requesting the webdriver
        here creates it now, and it is then re-used later in the
        application.
        """
        # Only the side effect of the property access is wanted.
        _ = self.webdriver

    def do_random_page(self) -> None:
        """
        Select a random page and repeatedly click the first link until
        we reach the article on philosophy.

        Sometimes the driver encounters a loop and will never reach the
        page, and sometimes the parser fails and we are unable to
        continue. Both cases are logged and end the crawl.
        """
        # Landing page: go there and select the language. The language
        # comes from the runtime configuration, so the program can be run
        # for English or Spanish or Russian or what have you.
        page_api = pages.LandingPage(self.webdriver)
        page_api.goto_landing_page()
        page_api.select_language(config.obj.PAGE_LANGUAGE)

        # Main page: at this point we have clicked a link and changed the
        # page, so we re-create our page interface as an object of a
        # different class that carries the code for working with this
        # page. It receives the same webdriver and calls methods on it to
        # make the browser click links or extract text.
        page_api = pages.MainPage(self.webdriver)
        page_api.goto_random_article()

        # Article pages: follow first links, remembering every title seen
        # so that a cycle can be detected.
        pages_visited = []
        while True:
            page_api = pages.ArticlePage(self.webdriver)
            title = page_api.get_title()
            logging.debug('visited page: %s', title)

            if title in pages_visited:
                logging.info('encountered loop at page = %s', title)
                break

            if title == 'Philosophy':
                # The count is taken before the final page is recorded, so
                # it is the number of pages visited on the way here.
                logging.info('made it to philosophy in %s pages',
                             len(pages_visited))
                pages_visited.append(title)
                break

            pages_visited.append(title)

            # A falsy result from click_first_link() ends the crawl.
            if not page_api.click_first_link():
                # logging.warn is a deprecated alias; use logging.warning.
                logging.warning('failure: unable to continue (perhaps no valid links?)')
                break

        print()