#!/usr/bin/python3
"""Read seasonal snowfall records from the Web and print the ten largest.

Ported from Python 2 to Python 3 (urllib.request, print()).  The fetch and
format-sniffing logic lives in main() so that the pure table parser,
top_snowfalls(), can be tested without network access.
"""
import sys
import urllib.request


def top_snowfalls(text, delim, delim_end, count=10):
    """Return the `count` largest snowfall seasons from a plain-text table.

    The table of interest starts at the first occurrence of `delim` and
    stops at `delim_end`.  Each data line must begin with a season label
    (e.g. "1995-96") and end with the season's total snowfall in inches.

    Returns a list of (season, inches) pairs, largest snowfall first.
    """
    table = delim + text.split(delim)[1]   # keep the delimiter row itself
    table = table.split(delim_end)[0]      # discard everything past the data
    rows = table.split('\n')
    # Season label is the first field on a row; the total is the last field.
    pairs = [(row.split()[0], row.split()[-1]) for row in rows]
    ranked = sorted(((float(total), season) for season, total in pairs),
                    reverse=True)
    return [(season, total) for total, season in ranked[:count]]


def main():
    """Fetch the snowfall page, normalize it, and print the top ten."""
    import html2text  # third-party; imported here so the parser stays testable

    # Unfortunately, NOAA has stopped making this information _easily_
    # available.  The original URL no longer works, so we use an archived
    # copy of the old web page:
    #   http://www.erh.noaa.gov/box/climate/bossnw.shtml
    url = "https://course.ccs.neu.edu/cs7600/parent/python/snowfall-boston.html"
    html = urllib.request.urlopen(url).read().decode('utf-8')

    # Sniff the page format and pick the delimiters that bracket the data.
    if "1891-92" in html:
        # URL taken from noaa.gov (or a copy of it): already plain enough.
        delim = "1891-92"
        delim_end = "\n\n"
    elif "1936" in html:
        # URL taken from wrcc.dri.edu: needs an html-to-text conversion.
        delim = "1936"
        delim_end = "\n2016"
        html2text.BODY_WIDTH = 0  # stop html2text from wrapping lines
        html = html2text.html2text(html)
        # ASCII is good enough in the US of A. :-)  Drop non-ASCII chars.
        html = html.encode("ascii", "ignore").decode("ascii")
        # html2text was writing '\n\n' between rows; patch that back up.
        html = html.replace('\n', '\t')
        html = html.replace('\t\t', '\n')
    else:
        print("*** unknown format ***")
        sys.exit(1)  # bug fix: the original fell through to a NameError here

    winners = top_snowfalls(html, delim, delim_end)
    print("Ten largest snowfalls:\n" +
          '\n'.join(str(pair) for pair in winners))


if __name__ == "__main__":
    main()