You will need to get sqlite for this tutorial:
pip install sqlite
If you need help with pip and installing packages, check out the .
import time
import urllib2
from urllib2 import urlopen
import re
import cookielib, urllib2
from cookielib import CookieJar
import datetime
import sqlite3
cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
conn = sqlite3.connect('knowledgeBase.db')
c = conn.cursor()
def createDB():
c.execute("CREATE TABLE knowledgeBase (unix REAL, datestamp TEXT, namedEntity TEXT, relatedWord TEXT)")
c.commit()
def main():
try:
page = 'http://feeds.huffingtonpost.com/huffingtonpost/raw_feed'
sourceCode = opener.open(page).read()
#print sourceCode
try:
titles = re.findall(r'<title>(.*?)</title>',sourceCode)
links = re.findall(r'<link.*?href=\"(.*?)\"',sourceCode)
#for title in titles:
#print title
for link in links:
if '.rdf' in link:
pass
else:
print 'let\'s visit:', link
print ' _____________________________________'
linkSource = opener.open(link).read()
linesOfInterest = re.findall(r'<p>(.*?)</p>',str(linkSource))
print 'Content:'
for eachLine in linesOfInterest:
if '<img width' in eachLine:
pass
elif '<a href=' in eachLine:
pass
else:
print eachLine
time.sleep(1)
except Exception, e:
print str(e)
except Exception,e:
print str(e)
pass
createDB()