I am defining a crawlSleep procedure. It should use the datetime module to work out how many days have passed since the last crawl, and remain in sleep mode (inside an appropriate while loop) while less than one week has elapsed. Once a full week has passed since the last crawl, it should start a recrawl and update the crawl date. However, I have to compute the elapsed days without built-in date arithmetic: the assignment says to use the datetime module solely to obtain the current date and to calculate the passed days with my own procedures.
HTML parsing is not very important at this stage; what I want to learn is how to define this crawlSleep function. Here is my code (a sketch of the hand-rolled day counter I have in mind follows the listing):
import datetime
import time            # used by crawlSleep below
import urllib.request  # moved to the top from inside get_page
def get_page(url):
    # Fetch a page and decode it; return "" if anything goes wrong.
    try:
        page = urllib.request.urlopen(url).read()
        return page.decode("utf-8")
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        return ""
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
def updateCrawlDate(index, keyword, url, date):
    # Leftover from the old list-of-lists index; unused now that
    # index is a dictionary (see crawlWeb below).
    for entry in index:
        if entry[0] == keyword:
            entry[entry.index(url) + 1] = date
            return
    index.append([keyword, url, date])
def add_toIndex(index, keyword, url, date):  ### this also had to change for the dict index
    # Store [url, date] pairs; my earlier version built a set {url, date}
    # and then overwrote the whole list, losing the url.
    if keyword in index:
        index[keyword].append([url, date])
    else:
        index[keyword] = [[url, date]]
def getclearpage(content):
    # Crude tag stripper -- HTML parsing is not the focus at this stage.
    title = content[content.find("<title>") + 7:content.find("</title>")]
    body = content[content.find("<body>") + 6:content.find("</body>")]
    while body.find("<") != -1 and body.find(">") != -1:
        start = body.find("<")
        end = body.find(">")
        body = body[:start] + body[end + 1:]
    return title + body
def addPageToIndex(index, url, content, date):
    content = getclearpage(content)
    words = content.split()
    for word in words:
        add_toIndex(index, word, url, date)
def crawlWeb(seed):
    tocrawl = [seed]
    crawled = []
    index = {}  ### changed the index from a list to a dictionary
    global last_crawldate
    last_crawldate = datetime.datetime.now().strftime("%y-%m-%d")
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            addPageToIndex(index, page, content, "LastcrawlDate:" + last_crawldate)
            union(tocrawl, get_all_links(content))  # reuse content; no second fetch
            crawled.append(page)
    return index  #, last_crawldate
def crawlSleep(index, seed):
    global last_crawldate
    while True:
        current_date = datetime.datetime.now().strftime("%y-%m-%d")
        # Strings cannot be subtracted; days_elapsed (sketched below)
        # does the date arithmetic by hand, as the assignment requires.
        if days_elapsed(last_crawldate, current_date) >= 7:
            return crawlWeb(seed)  # recrawl; crawlWeb also refreshes last_crawldate
        time.sleep(3600)  # stay in sleep mode and re-check every hour
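For the hand-rolled day counter, here is a minimal sketch of what I have in mind. It assumes the date strings come from strftime("%y-%m-%d") as in crawlWeb above, and that the two-digit year always means 20yy; the helper names (is_leap, days_in_month, to_day_number, days_elapsed) are ones I made up, and datetime is only used to obtain the current date, never for the arithmetic itself:

def is_leap(year):
    # Gregorian rule: divisible by 4, except centuries not divisible by 400.
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

def days_in_month(year, month):
    lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    if month == 2 and is_leap(year):
        return 29
    return lengths[month - 1]

def to_day_number(datestring):
    # Convert "yy-mm-dd" to a count of days since the start of 2000.
    yy, mm, dd = datestring.split("-")
    year, month, day = 2000 + int(yy), int(mm), int(dd)
    total = day
    for m in range(1, month):
        total += days_in_month(year, m)
    for y in range(2000, year):
        total += 366 if is_leap(y) else 365
    return total

def days_elapsed(earlier, later):
    return to_day_number(later) - to_day_number(earlier)

For example, days_elapsed("25-01-28", "25-02-04") gives 7, so crawlSleep would trigger a recrawl at that point. Is this roughly the right way to structure crawlSleep, or is there a cleaner pattern for the sleep loop?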