How can I define sleep function without use sleep() when build a web crawler?

60 Views Asked by At

Defining a crawlSleep procedure. This procedure will utilize the datetime module to calculate how many days have passed since the last crawl. It will remain in sleep* mode (within an appropriate while loop) until the time elapsed since the last crawl is less than one week. If one week has passed since the last crawl, it will initiate a recrawl and update the crawl date.However,i have to define days elapsed function without use built-in functions(use the datetime module solely to obtain the current date. Calculate passed days using your own procedures. Do not use built-in functions for this purpose.),

Html parsing is not very important at this stage, I. want to learn ho to define this function()crawl sleep:Here is my code:

import datetime

def get_page(url):
  try:
    import urllib.request
    page = urllib.request.urlopen(url).read()
    page = page.decode("utf-8")
    return page
  except:
    return ""


def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote+1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def get_all_links(page):
    links = []
    while True:
      url, endpos = get_next_target(page)
      if url:
        links.append(url)
        page = page[endpos:]
      else:
        break
    return links


def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)
def updateCrawlDate(index, keyword, url, date):
  for entry in index:
    if entry[0] == keyword:
      entry[entry.index(url)+1] = date
      return
  index.append([keyword, url, date])



def add_toIndex(index, keyword, url,date):### we must be also changed here
   #date = datetime.datetime.now().strftime("%d-%m-%y")
   if keyword in index:
      index[keyword].append({url,date})
      #index[keyword].append({date})
   else:
      index[keyword]=[{url}]
      index[keyword]=[{'test:'+date}]
      

def getclearpage(content):
  title = content[content.find("<title>")+7:content.find("</title>")]
  body = content[content.find("<body>")+6:content.find("</body>")]
  while body.find(">") != -1:
    start =  body.find("<")
    end =  body.find(">")
    body = body[:start] + body[end+1:]
  return title + body


def addPageToIndex(index, url, content,date):
  content = getclearpage(content)
  words = content.split()
  #date=  datetime.datetime.now().strftime("%d-%m-%y")
  for word in words:
    add_toIndex(index, word, url,date)


def crawlWeb(seed):
  tocrawl = [seed]
  crawled = []
  index = {} ### i changed list to dictionary
  global last_crawldate ##
  last_crawldate= datetime.datetime.now().strftime("%y-%m-%d")
  while tocrawl:
    page = tocrawl.pop()
    if page not in crawled:
      content = get_page(page)
      addPageToIndex(index, page, content,"LastcrawlDate:" +last_crawldate)
      union(tocrawl, get_all_links(get_page(page)))
      crawled.append(page)
  return index #, last_crawldate
ef crawlSleep(index,seed):
    global last_crawldate
    while True:
      current_time = datetime.datetime.now().strftime("%y-%m-%d")
      daysPassed = current_time - last_crawldate  ## i know that these are not the data types we need to apply on them ..  
      if daysPassed >= 7 :
            return crawlWeb(seed)
      else:
          return crawlSleep(index,seed)
0

There are 0 best solutions below