Simple Dmoz Scraping

Python makes it really simple to scrape websites for content. This is such a basic example that I’m really only posting this code to test syntax highlighting on my blog engine.

It’s a class that scrapes URLs from Dmoz given a search term. Not fancy, I’m just using it for a machine learning project. It does what I need for now.

 try:
     from urllib import quote_plus  # Python 2
 except ImportError:
     from urllib.parse import quote_plus  # Python 3

 import mechanize
 from BeautifulSoup import BeautifulSoup

 class Dmoz(object):
     """Scrape result URLs from the Dmoz search page for a search term."""

     # Base of the search request; the URL-encoded term is appended to it.
     SEARCH_URL = "http://www.dmoz.org/search?q="

     def __init__(self):
         """Create the mechanize browser used for all requests."""
         self.br = mechanize.Browser()

     def get_page_urls(self, term):
         """Return the site URLs listed on the search results page for *term*.

         Args:
             term: Search phrase. It is URL-encoded before being sent, so
                 spaces and characters such as ``&`` or ``#`` are safe.

         Returns:
             A list of ``href`` values, one per ``<li>`` inside the first
             ``<ol class="site">`` element of the page. Items that have no
             direct child anchor, or whose anchor has no ``href``, are
             skipped. An empty list is returned when the page contains no
             such ``<ol>``.
         """
         # Encode the term so it cannot break or alter the query string.
         response = self.br.open(self.SEARCH_URL + quote_plus(term))
         soup = BeautifulSoup(response.read())

         listing = soup.find('ol', {"class": "site"})
         if not listing:
             return []

         urls = []
         for item in listing('li'):
             # Only anchors that are direct children of the <li> count.
             anchors = item('a', recursive=False)
             if not anchors:
                 continue
             href = anchors[0].get('href')
             if href:
                 urls.append(href)
         return urls

 def main():
     """Example usage: print the URLs Dmoz returns for "Computer Science"."""
     scraper = Dmoz()
     found = scraper.get_page_urls("Computer Science")
     print(found)

 # Run the example only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == "__main__":
     main()