import logging
import re
import selenium
import selenium.webdriver
import time

# Placeholder until init() installs the real settings object.  The code in
# this module reads the attributes DO_BREAKPOINTS, PAGE_DELAY and
# PAGE_BASE_URL from it, so init() must be called before any page object
# is used.
settings = {}

# JavaScript used by PageRootObject.highlight(); %s receives a CSS colour.
_HIGHLIGHT_JS = (
    'arguments[0].setAttribute("style", '
    '"background: %s;font-weight:bold;color:#000")'
)

# Colour names accepted by PageRootObject.highlight() and their CSS values.
_HIGHLIGHT_COLORS = {
    'red': '#ff9292',
    'blue': '#9292ff',
}

# Shortest "(...)" span on a single line (no DOTALL, so "." stops at "\n").
_PARENTHESIZED = re.compile(r'\(.*?\)', flags=re.UNICODE)


def init(settings_obj):
    """Install *settings_obj* as the module-wide settings object."""
    global settings
    settings = settings_obj


def breakpoint():
    """Wait for the user to press Enter when settings.DO_BREAKPOINTS is set.

    NOTE: inside this module the name shadows the builtin ``breakpoint``;
    it is kept because it is part of the module's public interface.
    """
    if settings.DO_BREAKPOINTS:
        input('Breakpoint here. to continue...')


class PageRootObject:
    """Base page object holding the WebDriver shared by all pages."""

    def __init__(self, driver=None):
        """Use *driver* if given, otherwise create a new one.

        ``create_webdriver`` is not defined in the visible part of this
        file; it has to be provided elsewhere in the module.
        """
        self.driver = driver or create_webdriver()

    def click(self, el):
        """Highlight *el* in red, wait settings.PAGE_DELAY seconds, click it."""
        self.highlight(el, 'red')
        time.sleep(settings.PAGE_DELAY)
        el.click()

    def highlight(self, el, color):
        """Mark *el* in the browser by replacing its inline ``style``.

        Args:
            el: the element passed to the script as ``arguments[0]``.
            color: ``'red'`` or ``'blue'``.

        Raises:
            ValueError: if *color* is not a supported colour name
                (previously this surfaced as an UnboundLocalError).
        """
        try:
            css_color = _HIGHLIGHT_COLORS[color]
        except KeyError:
            raise ValueError(
                'unsupported highlight color %r; expected one of %s'
                % (color, ', '.join(sorted(_HIGHLIGHT_COLORS)))
            ) from None
        self.driver.execute_script(_HIGHLIGHT_JS % css_color, el)


class LandingPage(PageRootObject):
    """Page reached at settings.PAGE_BASE_URL, offering language links."""

    def goto_landing_page(self):
        """Navigate the driver to settings.PAGE_BASE_URL."""
        self.driver.get(settings.PAGE_BASE_URL)

    def select_language(self, language):
        """Click the first link whose text contains *language*."""
        link = self.driver.find_element_by_partial_link_text(language)
        self.click(link)


class MainPage(PageRootObject):
    """Main page exposing the "random page" navigation entry."""

    def goto_random_article(self):
        """Click the link inside the ``n-randompage`` list item."""
        link = self.driver.find_element_by_xpath(
            '//li[contains(@id, "n-randompage")]/a')
        self.click(link)


class ArticlePage(PageRootObject):
    """Article page: reads the title and follows the first valid link."""

    # Element ids looked up on the article page.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
        'article-title': 'firstHeading',
    }

    def get_title(self):
        """Return the text of the article heading element."""
        heading = self.driver.find_element_by_id(
            ArticlePage.elements['article-title'])
        return heading.text

    def click_first_link(self):
        """Click the first valid link of the article.

        Returns:
            True if a link was clicked, False if no valid link was found.
        """
        return self._iterate_paragraphs()

    def _iterate_paragraphs(self):
        """Try each content paragraph in order until one yields a click.

        Paragraphs containing a nested span whose id contains
        "coordinates" are excluded by the XPath.

        Returns:
            True as soon as a paragraph produced a click, otherwise False.
        """
        main_window = self.driver.find_element_by_id(
            ArticlePage.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements_by_xpath(
            './div[contains(@class, "mw-parser-output")]'
            '/p[not(span/span[contains(@id, "coordinates")])]')
        for paragraph in paragraphs:
            if self._parse_paragraph(paragraph):
                return True
        return False

    def _parse_paragraph(self, p):
        """Click the first valid link inside paragraph *p*.

        Rejected links are highlighted blue; the chosen link is highlighted
        red and clicked after an optional breakpoint.

        Returns:
            True if a link was clicked, False if the paragraph has no
            links or none of them is valid.
        """
        links = p.find_elements_by_xpath('.//a')
        if not links:
            return False
        for link in links:
            logging.debug('processing link: %s', link.text)
            if not self._is_valid_link(p, link):
                # Covers every rejection reason, not only parentheses.
                logging.debug('skipping invalid link: %s', link.text)
                self.highlight(link, 'blue')
                continue
            self.highlight(link, 'red')
            logging.info('selected link: %s', link.text)
            breakpoint()
            link.click()
            return True
        return False

    def _is_valid_link(self, p, el):
        """Return True if link *el* of paragraph *p* may be followed.

        A link is rejected when it has no ``href`` (nothing to navigate
        to), is a footnote, a pronunciation-help link, an audio file, or
        sits inside a parenthetical of the paragraph.
        """
        if not el.get_attribute('href'):
            return False
        # Cheap href checks first; the parenthesis check fetches HTML.
        if self._is_link_a_footnote(el):
            return False
        if self._is_link_pronounciation(el):
            return False
        if self._is_link_audio(el):
            return False
        return not self._is_link_in_parenthesis(p, el)

    def _is_link_in_parenthesis(self, p, el):
        """Return True if *el*'s HTML occurs inside a "(...)" span of *p*.

        The link's ``outerHTML`` is searched for in every non-overlapping
        parenthesised span of the paragraph's ``innerHTML``.  Using
        ``finditer`` resumes exactly where the previous span ended, so a
        parenthetical directly following another one is not skipped.
        """
        link_html = el.get_attribute('outerHTML')
        paragraph_html = p.get_attribute('innerHTML')
        for match in _PARENTHESIZED.finditer(paragraph_html):
            if link_html in match.group(0):
                logging.debug('link found inside: %s', match.group(0))
                return True
        return False

    def _is_link_a_footnote(self, el):
        """Return True if *el*'s href points at a citation/footnote anchor."""
        # get_attribute() returns None when the attribute is absent.
        href = el.get_attribute('href') or ''
        return '#cite_note' in href or '#cnote' in href

    def _is_link_pronounciation(self, el):
        """Return True if *el*'s href points at an IPA help page."""
        href = el.get_attribute('href') or ''
        return '/wiki/Help:IPA' in href

    def _is_link_audio(self, el):
        """Return True if *el*'s href references an ``.ogg`` file."""
        href = el.get_attribute('href') or ''
        return '.ogg' in href