#!/usr/bin/python # # This is -*- python -*- code # USER_AGENT = 'FindNew/0.6' # Put this here so I don't forget to update it # FindNew.py 0.6: # # Traverses a bookmarks file or web page to find out what items # have been updated lately. Similar to a feature in Netscape 2.0's # bookmarks, but more HTML-friendly. # # Chris Lawrence # Freely distributable # 16 May 1998 # TO DO: # Use multiple sockets at once # Better exceptions # Changes since 0.5: # Now uses Python 1.5+'s rfc822.mktime_tz() function; this may even work # right in 1.5.2 (patches for 1.5.1 should be on the Python patch page soon) # Changes since 0.4: # Changes toward HTTP/1.1 compliance # . Host header # . Connection header # Updated for Python 1.5; probably incompatible with < 1.5 # Changes since 0.3: # Now handles timezones better. Requires patch to rfc822.py posted to # comp.lang.python by Guido (NOT my patch) # Shows the dates of modified pages # You can now specify the number of days old a page can be to be considered # updated. # Added easier ignored services list (rather than mega-if) # Ignore nntp and pop3 URLs # Fixed a bug that made unupdated pages look like they had no modification # time. # Changes since 0.2: # It now makes some HTML and spawns your web browser to look # at the results. # Changes since 0.1: # What it says is actually correct ;) # Better handling of mail and news URLs # No longer hosed if you don't have a WWW_HOME env var. import sys, os, time, urlparse, string, httplib, htmllib, urllib, formatter import socket, rfc822 from types import * HOME_URL = 'http://www.clark.net/pub/lawrencc/linux/findnew.py' WEB_BROWSER = 'lynx' PROXY = None # Use 'hostname:port' TMP_PAT = '/tmp/jumplist%d.html' # Lynx seems to need the extension DEBUG = 1 # Use 0 to switch off debugging # pop3 is used by Netscape IgnoreServices = ['mailto', 'news', 'telnet', 'newspost', 'nntp', 'pop3'] def check_url(url): servname, hostandport, server_url, parameters, query, fragment =\ urlparse.urlparse(url, 'file'); if server_url == '': server_url = '/' if servname == 'http': # All we do is send a HEAD request, and hope # like hell that we're dealing with a HTTP 1.0+ # compliant server (i.e. not an ancient CERN or NCSA) if PROXY is None: hreq = httplib.HTTP(hostandport) hreq.putrequest('HEAD',server_url) else: # Hopefully the proxy is smart enough to strip # any fragment from the request [if not, who cares?] hreq = httplib.HTTP(PROXY) hreq.putrequest('HEAD',url) hreq.putheader('User-Agent',USER_AGENT) # To be polite hreq.putheader('Host', hostandport) # HTTP/1.1 Host header hreq.putheader('Connection', 'close') # Non-persistent cnxn hreq.endheaders() errcode, errmsg, headers = hreq.getreply() if errcode == 302 or errcode == 301: # Handle redirected URLs correctly urlretry = headers.getheader('Location') if urlretry is None: raise IOError, 'Unable to obtain new '+\ 'location.' return check_url(urlretry) elif errcode == -1: raise IOError, 'Invalid response from server: '+\ `errmsg` elif errcode != 200: raise IOError, 'Unhandled reply from server: '+\ `errcode`+' '+`errmsg` datecode = headers.getdate_tz('Last-modified') return datecode elif servname == 'ftp' or servname == 'file': # Probably easy enough to implement... # but I'm lazy raise ValueError, "ftp service unimplemented" elif servname == 'https': raise ValueError, 'SSL secure HTTP unimplemented' elif servname in IgnoreServices: return 0 else: raise ValueError, servname+" service unimplemented" def datestr(timeval): try: return time.strftime('%b %d %Y at %I:%M %p %Z', time.localtime(timeval)) except: return 'Invalid date: '+`timeval` def is_url_newer(url, lastmod): try: date = check_url(url) if date: juldate = rfc822.mktime_tz(date) # print datestr(juldate) if (juldate > lastmod): return juldate else: return 0 elif date == 0: return 0 else: return -1 except (IOError, ValueError, socket.error), x: return str(x) def main(): # We'll play with the HTML parsing in Python for this stuff # Basically we retrieve the specified URL, parse for "A" # tags with HREF's, and go from there. sincedate = 2 if len(sys.argv) > 1: url = sys.argv[1] if len(sys.argv) > 2: sincedate = string.atoi(sys.argv[2]) elif os.environ.has_key('WWW_HOME'): url = os.environ['WWW_HOME'] else: print 'You must specify a URL or a filename.' return # Handle modification times if sincedate < 1: sincedate = 1 lastchange = time.time() - (sincedate*24*60*60) if DEBUG: print 'Checking for pages updated since '+\ datestr(lastchange) # Canonicalize the URL stuff = urlparse.urlparse(url, 'file') url = urlparse.urlunparse( (stuff[0], stuff[1], os.path.join(os.getcwd(), stuff[2]), stuff[3], stuff[4], stuff[5]) ) if DEBUG: print 'Canonical URL:',url data = urllib.urlopen(url).read(); parser = htmllib.HTMLParser(formatter.NullFormatter(None)) parser.feed(data) if(parser.base is not None): url = parser.base pagelist = {} for x in parser.anchorlist: (pagex, blah) = urlparse.urldefrag(urlparse.urljoin(url, x)) if not pagelist.has_key(pagex): if DEBUG: print 'Checking',pagex ok = is_url_newer(pagex, lastchange) pagelist[pagex] = (ok, (x)) else: pagelist[pagex] = (pagelist[pagex][0], \ pagelist[pagex][1]+(x)) if len(pagelist) == 0: sys.stderr.write('No pages updated.\n') parser.close() return filename = TMP_PAT % os.getpid() outfile = open(filename, 'w') outfile.write('\n\n%s results\n'\ % USER_AGENT) outfile.write('\n\n') outfile.write('

The URL %s has links to the '\ 'following pages that have possibly been modified '\ 'since %s.

\n' % (url, url, datestr(lastchange))) outfile.write('\n') outfile.write('

Generated by %s, by ' \ 'Chris Lawrence <quango@ix.netcom.com>.

\n' \ % (HOME_URL, USER_AGENT)) outfile.write('\n\n') outfile.close() if os.fork(): os.wait() else: os.execvp(WEB_BROWSER, (WEB_BROWSER, filename)) os.remove(filename) parser.close() if __name__ == '__main__': main()