#!/usr/bin/python
#
ver = 0.07
# This version is not ready for prime time.
# Scrapes Craigslist job listings that allow telecommuting.
# pflint Sat 19 Apr 2014 09:42:23 AM EDT 
# Guidance on soup https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# Local guidance: http://docbox.flint.com:8081/boj#BeautifulsoupInstall
# Additional Guidance:  http://docbox.flint.com:8081/boj#PythonBased
# Past Programs: http://docbox.flint.com:8081/boj#programs
# 2014-10-06 08:03:39 need error checking on open!!!
#
# Run with PYTHONIOENCODING=utf-8, or rely on the stdout wrapper below. # die unicode die
#
import sys
# from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
import urllib2
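# In-script fallback for the PYTHONIOENCODING note above: a sketch, assuming
# Python 2. When stdout is a pipe its encoding is None and printing unicode
# raises UnicodeEncodeError, so wrap it in a UTF-8 writer.
import codecs
if sys.stdout.encoding is None:
	sys.stdout = codecs.getwriter('utf-8')(sys.stdout)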
cityw = {
	'Burlington VT':'https://burlington.craigslist.org',
	'Washington DC':'https://washingtondc.craigslist.org',
	'New York NY':'https://newyork.craigslist.org',
	'Albany NY':'https://albany.craigslist.org',
	'Eastern CT':'https://newlondon.craigslist.org',
	'Hartford CT':'https://hartford.craigslist.org',
	'New Haven CT':'https://newhaven.craigslist.org',
	'Northwest CT':'https://nwct.craigslist.org',	
	'Montreal CA':'https://montreal.craigslist.ca' 
	}
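# Each value is the root URL of a regional Craigslist site; the search path
# and job category are appended per city in the loop below.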
#	Above is the real data set.  Below is the test one.
test_cityw = {
	'Burlington VT':'https://burlington.craigslist.org',
	'Washington DC':'https://washingtondc.craigslist.org',
	'New York NY':'https://newyork.craigslist.org',
	# Duplicate dict keys silently collapse to the last entry, so the three
	# Montreal mirrors need distinct keys to all survive.
	'Montreal CA (fr)':'https://montreal.fr.craigslist.ca',
	'Montreal CA (en)':'https://montreal.en.craigslist.ca',
	'Montreal CA':'https://montreal.craigslist.ca'
	}
# 
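# These appear to be Craigslist job-category slugs; best guess at the mapping
# (unverified): eng = architect/engineer/CAD, sad = systems/networking,
# sof = software/QA/DBA, tch = technical support.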
specs =['eng','sad','sof','tch']
test_specs =['sof','tch']
for city, base in cityw.items():
	print '<p>  <b>'+city+'</b> </p>'
	for spec in specs:
		url=base+"/search/"+spec+"?is_telecommuting=1"
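		# e.g. https://burlington.craigslist.org/search/sof?is_telecommuting=1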
		# +"?addOne=telecommuting"
		# print url
		# print base
		# Create the soup
		page=urllib2.urlopen(url)
		soup = BeautifulSoup(page.read(),"lxml")
		# Search the soup: each posting title is an <a class="hdrlnk"> tag.
		# 2014-10-06 08:06:57 need a filter list.
		sline = soup.find_all('a', class_='hdrlnk')
		for tag in sline:
			job = tag.get('href', '')
			# Scheme-relative links ("//...") need the https: prefix;
			# site-relative links ("/...") need this city's base URL.
			if job.startswith('//'):
				job = 'https:' + job
			elif job.startswith('/'):
				job = base + job
			print '<p>  <a href="'+job+'">'+tag.get_text()+'</a> </p>'
		#
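# Usage sketch (assumes this file is saved executable as clscrape.py):
#   PYTHONIOENCODING=utf-8 ./clscrape.py > telecommute_jobs.html
# Output is an HTML fragment: a bold header per city plus one <p> link
# per matching posting.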
