#!/usr/bin/python
#
# harvester -- fetch package updates over the net
#
# By Eric S. Raymond <esr@thyrsus.com>
#

import sys, time, os, string, socket, getopt, urllib, re, smtplib

class watchee:
    # One page on the watchlist, together with its per-page options.
    def __init__(self, location):
        # Regular expressions; a package whose stem matches one is skipped.
        self.ignores = []
        # If true, don't fetch packages that are new to this machine.
        self.suppress_new = None
        # The URL of the page being watched.
        self.url = location

class watcher:
    # This is the regexp used to parse downloadable resources out of pages.
    # It is a template with three alternative arms, one per package type:
    # RPMs, Debian packages, and tarballs.  Each arm captures four groups --
    # stem, version, build level, and extension -- so the RPM arm fills
    # groups 1-4, the Debian arm groups 5-8, and the tarball arm groups 9-12.
    #
    # Each arm takes two % substitutions (six values in all): first the
    # architecture pattern, then the repeat count for the trailing
    # architecture-plus-extension part.  With a count of "1" that part must
    # be present; with "0" the extension isn't searched for.
    #
    # The optional URL/path prefix on the first line belongs to the RPM arm
    # only, because `|' binds more loosely than concatenation.
    # NOTE(review): the `.' in front of the architecture in the RPM and
    # tarball arms is unescaped, so it matches any single character.
    monster = \
            r"(?:(?:(?:http:)|(?:ftp:))?[A-Za-z0-9+_./-]*/)?" \
            r"([\w+.-]*)-([^/-]+)-([^/-]*)(?:.(?:%s|noarch|src)\.(rpm)){%s}|" \
            r"([\w+._]*)_([^/-]+)-([^/-]*)(?:_(?:%s|all)\.(deb)){%s}|" \
            r"([\w+.-]*)-([^/-]+)-([^/-]*)(?:.(?:%s|noarch|src)\.(tar|tgz|tar\.gz|tar\.Z)){%s}"

    def __init__(self):
        self.rcfile = os.environ["HOME"]+"/.harvester"	# session spec file
        self.localdir = os.environ["HOME"]+"/harvester"	# spool directory
        self.architecture = None		# machine arch to assume
        self.watchlist = []			# pages to watch 

        self.verbosity = 0			# level of verbosity in logs
        self.mailto = None			# mail results to person?
        self.copy = None			# download packages
        self.fetchnew = None;			# Force fetch of new URLs

        self.action_log = []			# Log of actions

    def read_rc(self):
        # Read the run control file
        try:
            fp = open(session.rcfile)
        except:
            session.fatal("Couldn't open run control file " + session.rcfile + ".\n")
            sys.exit(1)
        else:
            while 1:
                line = fp.readline()
                if not line:
                    break
                tokens = string.split(string.strip(line))
                if not tokens:
                    continue
                if tokens[0] == 'watch':
                    if tokens[1][0:5] != "http:" and tokens[1][0:4] != "ftp:":
                        self.log_action("Page type of "+tokens[1]+" is unknown, skipping.\n")
                        continue
                    lastwatch = watchee(tokens[1])
                    self.watchlist.append(lastwatch)
                elif tokens[0] == 'local':
                    self.localdir = tokens[1]
                elif tokens[0] == 'architecture':
                    self.architecture = tokens[1]
                elif tokens[0] == 'ignorenew':
                    if not lastwatch:
                        session.fatal("The `nonew' directive is illegal before the first `watch'")
                        sys.exit(1)
                    else:
                        lastwatch.suppress_new = 1
                elif tokens[0] == 'ignore':
                    if not lastwatch:
                        session.fatal("The `ignore' directive is illegal before the first `watch'")
                        sys.exit(1)
                    else:
                        lastwatch.ignores.append(tokens[1])
        fp.close()
        if not os.path.isdir(self.localdir):
            session.fatal("Local directory " + self.localdir + " does not exist!\n")
            sys.exit(1)
        if self.verbosity >= 3:
            self.log_action("Watchlist: \n")
            for w in self.watchlist:
                self.log_action("    " + w.url + "\n")
            self.log_action("Local directory: " + self.localdir + "\n")

        if not session.architecture:
            try:
                fp = os.popen("arch", "r")
                dflt_arch = string.rstrip(fp.readline())
                fp.close()
                if dflt_arch[0] == "i" and dflt_arch[1] in "123456789":
                    dflt_arch = "i[" + string.digits[1:string.atoi(dflt_arch[1])+1] + "]86"
                self.architecture = dflt_arch
            except:
                self.fatal("you must give an architecture in your rc file.");
                sys.exit(1);
        if self.verbosity >= 1:
            self.log_action("Architecture: " + self.architecture + "\n")

        # Tuple may need to get longer if we add more package types
        self.extprog = self.monster % (self.architecture,"1",
                                       self.architecture,"1",
                                       self.architecture,"1")
        self.extprog = re.compile(self.extprog)
        self.noextprog = self.monster % (self.architecture,"0",
                                       self.architecture,"0",
                                       self.architecture,"0")
        self.noextprog = re.compile(self.noextprog)

    def parse_line(self, str, w, force_ext):
        # Parse package name on given line into a (name, stem,
        # version-build) tuple.  The version and build parts are
        # canonicalized.  Purpose of the canonicalization is to make
        # it possible to do comparisons on version/build with a simple
        # lexicographic comparison (see self.compare_pkgname()).
        str = string.strip(str)
        # if self.verbosity >= 3:
        #    self.log_action("I see: " + str + "\n")
        if w and w.url[0:4] == "ftp:":
            str = str[string.rfind(str, " "):len(str)]
        if force_ext:
            match = self.noextprog.search(str)
        else:
            match = self.extprog.search(str)
        if not match:
            return None
        extracted = match.group(0)
        stem      = match.group(1)
        version   = match.group(2)
        build     = match.group(3)
        extension = match.group(4)
        if self.verbosity >= 3:
            self.log_action(
                "whole: " + extracted \
                + ", stem: " + stem \
                + ", version: " + version \
                + ", build: " + build \
                + ", extension: " + `extension` \
                + "\n")
        if force_ext and not extension:
            extension = force_ext
        if w:
            for regexp in w.ignores:
                if re.search(regexp, match.group(1)):
                    if self.verbosity >= 3:
                        self.log_action("*** Match rejected by `ignore "+ regexp + "'\n")
                    return None
        if w and string.find(extracted, "://") == -1:
            extracted = os.path.join(w.url, extracted)
        # Canonicalize version part.  First, supply 0 for minor
        # version/patchlevel if either is missing.  Then pad resulting
        # fields to length 6 so changes in the length of leading
        # numeric parts get correctly interpreted.  This code can be
        # fooled by version or build fields like "18p7" or "0a16", if
        # the number of digits in the suffix goes up but the first
        # digit decreases.
        version_list = string.split(version, '.')
        if (len(version_list) < 3):
            version_list.append("0")
        if (len(version_list) < 3):
            version_list.append("0")
        version_list = map(lambda x: string.zfill(x, 6), version_list)
        # Build level gets padded to length 6
        return (extracted, stem,
                string.join(version_list,'.')+"-"+string.zfill(build, 6),
                extension)

    def list_installed(self, cmd, fields, suffix):
        # Parse installed packages from the output of the given command.
        try:
            fp = os.popen(cmd, "r")
        except:
            log_action("package-listing" + cmd + " command failed\n")
        if self.verbosity >= 3:
            self.log_action("Parsing output of "+cmd+" package listing...\n")
        while 1:
            line = fp.readline()
            if not line:
                break
            tokens = string.split(string.strip(line))
            if len(tokens) < len(fields):
                continue
            line = string.join(map(lambda x, y=tokens: y[x], fields))
            tuple = self.parse_line(line, None, "suffix")
            if not tuple:
                self.fatal("Monster choked on " + line + "\n")
                sys.exit(1)
            self.installed.append(tuple)
        fp.close()

    def list_fetched(self):
        # Assemble fetched packages from local directory into self.fetched
        self.fetched = filter(lambda x, y=self: os.path.isfile(os.path.join(y.localdir, x)), os.listdir(self.localdir))
        self.fetched = map(lambda f: self.parse_line(f, None, None), self.fetched)
        self.remove_duplicates(self.fetched, "fetched")

    def list_updates(self):
        # Assemble package list from watched pages in self.updates.
        self.updates = []
        self.update_watchee = {}
        for w in self.watchlist:
            if self.verbosity >= 3:
                self.log_action("Retrieving " + w.url + ":\n")
            try:
                (filename, headers) = urllib.urlretrieve(w.url)
            except:
                self.log_action("Couldn't read " + w.url + " (network or server error)\n")
                continue
            if self.verbosity >= 3:
                self.log_action("Filename " + filename + ":\n")
                self.log_action("Headers:\n" + string.join(headers.headers) + "\n")
            try:
                fp = open(filename)
            except:
                self.log_action("Can't open local copy " + filename + "\n")
                sys.exit(1)
            else:
                while 1:
                    line = fp.readline()
                    if not line:
                        break
                    tuple = self.parse_line(line, w, None)
                    if not tuple:
                        continue
                    self.updates.append(tuple)
                    self.update_watchee[tuple[0]] = w
                fp.close()
        self.remove_duplicates(self.updates, "updates")

    def compute_needed(self):
        # Now compare the updates to the installed and fetched lists
        against_installed = {}
        against_fetched = {}
        for url in self.updates:
            for local in self.installed:
                status = self.compare_pkgname(url, local)
                if status:
                    against_installed[url] = (status, local[0])
                    break
            if not against_installed.has_key(url):
                against_installed[url] = (None, None)
            for local in self.fetched:
                status = compare_pkgname(url, local)
                if status:
                    against_fetched[url] = (status, local[0])
            if not against_fetched.has_key(url):
                against_fetched[url] = (None, None)

        # We can deduce the proper action from these two statuses:
        self.needed = []
        for url in self.updates:
            head =  url[0] + ": \n   "
            installed_status = against_installed[url][0]
            fetched_status = against_fetched[url][0]
            installed_version = against_installed[url][1]
            fetched_version = against_fetched[url][1]

            if installed_status == None and fetched_status == None:
                if self.fetchnew or not self.update_watchee[url[0]].suppress_new:
                    if self.verbosity >= 1:
                        self.log_action(head + "UPDATE, no relevant package either installed or fetched.\n")
                    self.needed.append(url)
                else:
                    if self.verbosity >= 1:
                        self.log_action(head + "no relevant package either installed or fetched.\n")
            elif installed_status == None and fetched_status == 'newer':
                if self.verbosity >= 1:
                    self.log_action(head + "UPDATE, newer than " + fetched_version
                           + " which is fetched but not installed.\n")
                self.needed.append(url)
            elif installed_status == None and fetched_status == 'same':
                if self.verbosity >= 1:

                    self.log_action(head + "this version has been fetched but not installed.\n")
            elif installed_status == None and fetched_status == 'older':
                if self.verbosity >= 1:
                    self.log_action(head + "newer version " + fetched_version
                           + " is fetched but not installed.\n")

            elif installed_status == 'newer' and fetched_status == None:
                if self.verbosity >= 1:
                    self.log_action(head + "UPDATE, newer than installed version " + installed_version
                           + " (no relevant fetched versions).\n")
                self.needed.append(url)
            elif installed_status == 'newer' and fetched_status == 'newer':
                self.log_action(head +
                           "UPDATE, newer than both installed version " + installed_version +
                           " and fetched version " + fetched_version + "\n")
                self.needed.append(url)
            elif installed_status == 'newer':	# handles two cases
                if self.verbosity >= 1:
                    self.log_action(head + "supersedes installed but not fetched version.\n")

            elif installed_status == 'same' and fetched_status == None:
                if self.verbosity >= 2:
                    self.log_action(head + "already installed (no relevant fetched versions).\n")
            elif installed_status == 'same' and fetched_status == 'same':
                if self.verbosity >= 2:
                    self.log_action(head+"already installed (fetched version same).\n")
            elif installed_status == 'same':	# handles two cases
                if self.verbosity >= 1:
                    self.log_action(head +
                           "already installed (fetched version " + fetched_version
                           + " is " + fetched_status +").\n")

            elif installed_status == 'older' and fetched_status == None:
                if self.verbosity >= 1:
                    self.log_action(head + "older than installed version " + installed_version +
                " (no relevant fetched versions).\n")
            elif installed_status == 'older':	# handles three cases
                if self.verbosity >= 1:
                    self.log_action(head +
                           "newer version " + installed_version +
                           "already installed (fetched version " + fetched_version +
                           " is " + fetched_status +").\n")
            else:
                self.log_action(head + "internal error!\n")

    def log_action(self, str):
        # Save status messages, also emit them to stderr if verbose is on
        self.action_log.append(str)
        if self.verbosity >= 2:
            sys.stderr.write(str)

    def fatal(self, str):
        # Record a fatal error
        self.action_log.append(sys.argv[0] + ": fatal error - " + str)
        sys.stderr.write(str)

    def compare_pkgname(self, pkg1, pkg2):
        # Return a status token comparing package tuples
        if pkg1[1] != pkg2[1]:
            return None			# Not comparable; different packages
        if pkg1[2] == pkg2[2]:
            return 'same'		# Version/build identical
        elif pkg1[2] > pkg2[2]:
            return 'newer'
        else:
            return 'older'

    def remove_duplicates(self, urls, name):
        # Pare down the given parsed-URL list to newest versions
        deletions = {}
        if self.verbosity >= 2:
            self.log_action("Pruning " + name + " packages...\n")
        for i in range(0, len(urls)):
            for j in range(0, len(urls)):
                status = self.compare_pkgname(urls[i], urls[j])
                if status == 'same' and i < j:
                    if self.verbosity >= 2:
                        self.log_action("Duplicate "+ urls[i][0] + " removed\n")
                    deletions[i] = 1
                elif status == 'newer':
                    if self.verbosity >= 2:
                        self.log_action("Older "+urls[j][0]+" removed ("+urls[i][0]+")\n")
                    deletions[j] = 1
        # Deletions have to be done last-to-first to avoid perturbing indices
        for i in range(0, len(urls)):
            j = len(urls) - 1 - i
            if deletions.has_key(j):
                del urls[j]
        if self.verbosity >= 2:
            self.log_action("Collected " + name + " packages after pruning:\n")
            for url in urls:
                self.log_action("    " \
                                + url[0] \
                                + " (" \
                                + url[1] + ", "
                                + url[2] + ", "
                                + `url[3]`
                                + ")\n")

    def report(self):
        # Report on the need list.
        if self.mailto:
            if self.needed:
                self.log_action("Recommended updates:\n")
            else:
                self.log_action("No recommended updates.\n")
        for update in self.needed:
            if self.copy:
                if self.verbosity >= 2:
                    self.log_action("Downloading " + update[0] + "...\n")
                try:
                    (filename, headers) = urllib.urlretrieve(update[0])
                    os.system("cp " + filename + " " + session.localdir + "/" + os.path.basename(update[0]))
                except:
                    self.log_action("Download of " + os.path.basename(update[0]) + " failed.\n")
            elif self.mailto:
                self.log_action(update[0] + "\n")
            else:
                print update[0]
        if self.verbosity >= 2:
            self.log_action("Done.\n")

        # Maybe we need to mail the action log now
        if self.mailto:
            msg = string.join(self.action_log, '')
            msg = "Subject: " + sys.argv[0] +  " report on new packages\n\n" + msg
            if self.needed:
                if self.copy:
                    msg = msg + "\nThese packages have been downloaded to " + \
                          session.localdir + "\n"
                else:
                    msg = msg + \
                          "\nYou can download these updates " \
                          "by running `harvester -c' from the command line.\n" 
            msg = msg + "--\n\t\t\t\tThe Package Harvester\n"
            try:
                server = smtplib.SMTP("localhost")
            except socket.error, details:
                session.fatal("connect to localhost failed.\n")
                sys.exit(1)            
            hostname = socket.gethostbyaddr(socket.gethostname())[0]
            me = os.environ['USER'] + "@" + hostname
            server.sendmail(me, [self.mailto], msg)
            server.quit()

    def prepare(self):
        # Initialize all the local databases necessary to do a check
        if not self.architecture:
            self.read_rc()
        self.installed = []
        if os.path.exists("/var/lib/rpm"):
            self.list_installed("rpm -qa", (0,), "rpm")
        if os.path.exists("/var/lib/dpkg"):
            self.list_installed("dpkg --list", (0,1,), "deb")
        # if self.verbosity >= 3:
        #     self.log_action("Installed packages:\n"+`self.installed`+"\n")
        self.remove_duplicates(self.installed, "installed")
        self.list_fetched()

    def harvest(self):
        # Gather informatiomn on updates
        self.prepare()
        self.list_updates()
        self.compute_needed()

if __name__ == '__main__':

    # Keep this a module-level global called `session': code elsewhere in
    # the file may refer to it by that name.
    session = watcher()

    # Options:
    #   -l, -v, -d   verbosity 1, 2, 3 respectively
    #   -f FILE      use FILE as the run control file
    #   -c           download the needed packages
    #   -m ADDRESS   mail the report to ADDRESS
    #   -n           force fetch of new URLs
    try:
        (options, arguments) = getopt.getopt(sys.argv[1:], "vdf:cm:ln")
    except getopt.error:
        # A bad option gets a usage message rather than a traceback.
        sys.stderr.write(sys.argv[0] + ": " + str(sys.exc_info()[1]) + "\n")
        sys.stderr.write("usage: " + sys.argv[0]
                         + " [-l] [-v] [-d] [-f rcfile] [-c] [-m address] [-n]\n")
        sys.exit(2)

    verbosity_levels = {'-l': 1, '-v': 2, '-d': 3}
    for (switch, val) in options:
        if switch in verbosity_levels:
            session.verbosity = verbosity_levels[switch]
        elif switch == '-f':
            session.rcfile = val
        elif switch == '-c':
            session.copy = 1
        elif switch == '-m':
            session.mailto = val
        elif switch == '-n':
            session.fetchnew = 1

    session.harvest()
    session.report()

# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End:
