#!/usr/bin/python
#
ver = 0.07
# This version is not ready for prime time.
# Scrapes Craigslist job listings that allow telecommuting.
# pflint Sat 19 Apr 2014 09:42:23 AM EDT 
# Guidance on soup https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# Local guidance: http://docbox.flint.com:8081/boj#BeautifulsoupInstall
# Additional Guidance:  http://docbox.flint.com:8081/boj#PythonBased
# Past Programs: http://docbox.flint.com:8081/boj#programs
# 2014-10-06 08:03:39 need error checking on open!!!
#
# Run with PYTHONIOENCODING=utf-8, or rely on the stdout wrapper below. # die unicode die
#
import sys
# from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
import urllib2
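# In-script fallback for the PYTHONIOENCODING note above: a sketch, assuming
# Python 2. When stdout is a pipe its encoding is None and printing unicode
# raises UnicodeEncodeError, so wrap it in a UTF-8 writer.
import codecs
if sys.stdout.encoding is None:
	sys.stdout = codecs.getwriter('utf-8')(sys.stdout)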
cityw = {
	'Burlington VT':'https://burlington.craigslist.org',
	'Washington DC':'https://washingtondc.craigslist.org',
	'New York NY':'https://newyork.craigslist.org',
	'Albany NY':'https://albany.craigslist.org',
	'Eastern CT':'https://newlondon.craigslist.org',
	'Hartford CT':'https://hartford.craigslist.org',
	'New Haven CT':'https://newhaven.craigslist.org',
	'Northwest CT':'https://nwct.craigslist.org',	
	'Montreal CA':'https://montreal.craigslist.ca' 
	}
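# Each value is the root URL of a regional Craigslist site; the search path
# and job category are appended per city in the loop below.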
#	Above is the real data set.  Below is the test one.
test_cityw = {
	'Burlington VT':'https://burlington.craigslist.org',
	'Washington DC':'https://washingtondc.craigslist.org',
	'New York NY':'https://newyork.craigslist.org',
	# Duplicate dict keys silently collapse to the last entry, so the three
	# Montreal mirrors need distinct keys to all survive.
	'Montreal CA (fr)':'https://montreal.fr.craigslist.ca',
	'Montreal CA (en)':'https://montreal.en.craigslist.ca',
	'Montreal CA':'https://montreal.craigslist.ca'
	}
# 
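# These appear to be Craigslist job-category slugs; best guess at the mapping
# (unverified): eng = architect/engineer/CAD, sad = systems/networking,
# sof = software/QA/DBA, tch = technical support.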
specs =['eng','sad','sof','tch']
test_specs =['sof','tch']
for city, base in cityw.items():
	print '<p>  <b>'+city+'</b> </p>'
	for spec in specs:
		url=base+"/search/"+spec+"?is_telecommuting=1"
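		# e.g. https://burlington.craigslist.org/search/sof?is_telecommuting=1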
		# +"?addOne=telecommuting"
		# print url
		# print base
		# Create the soup
		page=urllib2.urlopen(url)
		soup = BeautifulSoup(page.read(),"lxml")
		# Search the soup: each posting title is an <a class="hdrlnk"> tag.
		# 2014-10-06 08:06:57 need a filter list.
		sline = soup.find_all('a', class_='hdrlnk')
		for tag in sline:
			job = tag.get('href', '')
			# Scheme-relative links ("//...") need the https: prefix;
			# site-relative links ("/...") need this city's base URL.
			if job.startswith('//'):
				job = 'https:' + job
			elif job.startswith('/'):
				job = base + job
			print '<p>  <a href="'+job+'">'+tag.get_text()+'</a> </p>'
		#
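# Usage sketch (assumes this file is saved executable as clscrape.py):
#   PYTHONIOENCODING=utf-8 ./clscrape.py > telecommute_jobs.html
# Output is an HTML fragment: a bold header per city plus one <p> link
# per matching posting.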
