Saturday
Apr 30, 2011
DMOZ Scraper
Saturday, April 30, 2011 at 03:57 PM — I’m really only posting this code so I can check out the syntax highlighter in Jekyll.
It’s a class that scrapes URLs from DMOZ given a search term. It’s not fancy; I’m just using it for a machine learning project, and it does what I need for now.
import mechanize
from BeautifulSoup import BeautifulSoup
class Dmoz(object):
    """Scrape result URLs from a DMOZ search page.

    One ``mechanize.Browser`` is created per instance and reused for every
    search issued through ``get_page_urls``.
    """

    # Search endpoint; the percent-encoded search term is appended to it.
    SEARCH_URL = "http://www.dmoz.org/search?q="

    def __init__(self):
        """Create the browser used to fetch search pages."""
        self.br = mechanize.Browser()

    @classmethod
    def _search_url(cls, term):
        """Return the search URL for *term*, percent-encoding the term.

        Encoding matters because terms routinely contain spaces or
        characters such as ``&``, ``#`` and ``+`` that would otherwise
        change the meaning of the query string.
        """
        # Imported locally so the helper works on Python 2 and Python 3
        # without touching the file-level imports.
        try:
            from urllib import quote_plus  # Python 2
        except ImportError:
            from urllib.parse import quote_plus  # Python 3
        # Text (unicode) terms are encoded to UTF-8 first so that
        # non-ASCII characters are quoted byte by byte.
        if not isinstance(term, bytes):
            term = term.encode("utf-8")
        return cls.SEARCH_URL + quote_plus(term)

    def get_page_urls(self, term):
        """Return the site URLs listed on the search page for *term*.

        Args:
            term: The search phrase, e.g. ``"Computer Science"``.

        Returns:
            A list with the ``href`` of the direct-child ``<a>`` of every
            ``<li>`` inside the first ``<ol class="site">`` of the page.
            The list is empty when the page has no such ``<ol>``.
        """
        response = self.br.open(self._search_url(term))
        soup = BeautifulSoup(response.read())
        listing = soup.find('ol', {"class": "site"})
        if listing is None:
            return []

        urls = []
        for item in listing('li'):
            # Only a direct child anchor counts as the entry's own link.
            anchor = item.find('a', recursive=False)
            href = anchor.get('href') if anchor is not None else None
            # Skip entries without a usable link instead of aborting the
            # whole scrape with an IndexError/KeyError.
            if href is not None:
                urls.append(href)
        return urls
def main():
    """Demo: print the URLs scraped for a sample search term."""
    # eg:
    scraper = Dmoz()
    print(scraper.get_page_urls("Computer Science"))
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()