More Named Entity Recognition with NLTK




import time
import urllib2
from urllib2 import urlopen
import re
import cookielib
from cookielib import CookieJar
import datetime
import sqlite3
##
import nltk


cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]


conn = sqlite3.connect('knowledgeBase.db')
c = conn.cursor()


###define processor###
def processor(data):
    namedEntArray = []
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)

        entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        #('not', 'RB')
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else:
            print '_________________________'
            print 'Named:',entities[0]
            print 'Descriptions:'
            for eachDesc in descriptives:
                print eachDesc


    except Exception, e:
        print 'failed in the main try of processor'
        print str(e)
        time.sleep(555)
    

def huffingtonRSSvisit():
    try:
        page = 'http://feeds.huffingtonpost.com/huffingtonpost/raw_feed'
        sourceCode = opener.open(page).read()
        try:
            links = re.findall(r'<link.*href=\"(.*?)\"', sourceCode)
            for link in links:
                if '.rdf' in link:
                    pass
                else:
                    print 'visiting the link'
                    print '###################'
                    linkSource  = opener.open(link).read()
                    linesOfInterest = re.findall(r'<p>(.*?)</p>', str(linkSource))
                    print 'Content:'
                    for eachLine in linesOfInterest:
                        if '<img width' in eachLine:
                            pass
                        elif '<a href=' in eachLine:
                            pass
                        else:
                            #change this#
                            processor(eachLine)
                        
                        

                    time.sleep(5)


        except Exception, e:
            print 'failed 2nd loop of huffingtonRSS'
            print str(e)


    except Exception, e:
        print 'failed main loop of huffingtonRSS'
        print str(e)


huffingtonRSSvisit()
		

The next tutorial:





  • Simple RSS feed scraping
  • Simple website scraping
  • More Parsing/Scraping
  • Installing the Natural Language Toolkit (NLTK)
  • NLTK Part of Speech Tagging Tutorial
  • Named Entity Recognition NLTK tutorial
  • Building a Knowledge-base
  • More Named Entity Recognition with NLTK
  • Pulling related Sentiment about Named Entities
  • Populating a knowledge-base
  • What next?
  • Accuracy Testing
  • Building back-testing
  • Machine learning and Sentiment Analysis