#!/usr/bin/env python
#
# Extracts email threads or folders into quilt/mq patch series.  By default
# the patches are listed in a file called patches/series and patch files
# are saved into a patches directory.
#
# Example usage (see mseries --help for more options):
#
#   mseries '<message-id>'
#       finds the message in ~/Mail/inbox (either mbox or Maildir) and
#       extracts all the patches from the thread started by the message id
#       provided.  The id must include the surrounding angle brackets,
#       otherwise the argument is taken to be a folder name.
#   mseries -f =linux-kernel '<message-id>'
#       same as above but searches ~/Mail/linux-kernel
#   mseries -f linux-kernel '<message-id>'
#       same as above but searches linux-kernel (current directory)
#   mseries some_folder
#       pulls out all the threads from some_folder
#
# Each thread is annotated with a comment line in the series file.  If
# the thread starts with a 0 of N patch, the subject from that patch is
# used in the comment line.
#
#   mseries -o some_folder
#       forces all the messages found in some_folder into a single thread.
#
#   mseries -f -
#       reads one patch from stdin
#
# mseries understands Maildir, mbox and emlx folders.  Apparently
# emlx can be produced by some OSX email client.  mseries -e selects
# the emlx format.
#
# NOTE: this is a Python 2 script (it relies on rfc822, cStringIO and
# mailbox.PortableUnixMailbox).

import mailbox, os.path
import email
import re
import sys
import cStringIO
import rfc822
import tempfile
from optparse import OptionParser

VERSION = "mseries version 0.2"

# not referenced by anything visible in this file; kept so the module-level
# name does not disappear.
force = False

# parse out [PATCH X/N] long subject here
# also does [PATCH] long subject here
#   group 3: X (patch index), group 5: N (patch count), group 6: the subject
subjre = re.compile(r'(\[.+?((\d+)\s*(\/|of)\s*(\d+))?\])\s+(.+)')

# helpers used to turn a subject line into a file name
_whitespace_re = re.compile(r'\s')
_nonword_re = re.compile(r'\W')


class mboxError(Exception):
    pass


def _make_filename(text):
    """Turn free text into a patch file name.

    The text is lower-cased, every whitespace character becomes '_' and
    every remaining non-word character is dropped.  If nothing is left,
    "patch" is returned so the caller never ends up with an empty name.
    """
    name = _whitespace_re.sub('_', text.lower())
    name = _nonword_re.sub('', name)
    return name or "patch"


# quick and dirty reader for emlx folders: a directory with one message per
# *.emlx file.  Each file starts with one line giving the size in bytes of
# the message, followed by the message itself.  Whatever trails the message
# (xml metadata, according to the original author) is ignored.
class emlxfolder:
    def __init__(self, path, msgid=None):
        # msgid is accepted but unused; filtering is done by the mbox class
        print("found emlx folder %s" % path)
        self.path = path

    def __iter__(self):
        """Yield an rfc822.Message for every readable *.emlx file.

        Raises mboxError when the size line is not a number or the file is
        shorter than the size it announces.
        """
        # sorted so the messages come out in a reproducible order
        for name in sorted(os.listdir(self.path)):
            p = os.path.join(self.path, name)
            print("found %s" % p)
            if not os.path.isfile(p):
                continue
            if not p.endswith('.emlx'):
                continue
            with open(p) as fp:
                try:
                    size = int(fp.readline().rstrip())
                except ValueError:
                    raise mboxError("bad size line in %s" % p)
                data = fp.read(size)
            if len(data) != size:
                raise mboxError("bad file size %s" % p)
            yield rfc822.Message(cStringIO.StringIO(data))


# iterate through the rfc822 messages produced by the mailbox classes
# and filter out any that do not correspond to our msg id
class mbox:
    def __init__(self, path, msgid="", emlx=False):
        """Open the folder at path.

        path   -- a directory (Maildir, or emlx when emlx is true), '-'
                  for a single message on stdin, or an mbox file.
        msgid  -- when non-empty, only messages belonging to that thread
                  are produced by iteration.
        emlx   -- treat a directory as an emlx folder instead of a Maildir.

        self.force is set when reading from stdin; parse_subject() then
        accepts messages whose subject has no [PATCH ...] prefix.

        Raises mboxError if path does not exist.
        """
        self.msgid = msgid
        self.force = False
        if os.path.isdir(path):
            if emlx:
                self.mbox = emlxfolder(path)
            else:
                self.mbox = mailbox.Maildir(path)
        elif path == '-':
            fp = tempfile.TemporaryFile()
            line = sys.stdin.readline()
            # The mbox reader needs a "From " separator line in front of
            # the message.  A "From:" header is not such a separator, and
            # the synthetic separator must end with a newline or the first
            # real header line would be glued to it and lost.
            if not line.startswith('From '):
                fp.write('From unknown\n')
            fp.write(line)
            for x in sys.stdin:
                fp.write(x)
            fp.seek(0)
            self.mbox = mailbox.PortableUnixMailbox(fp)
            self.force = True
        else:
            if not os.path.exists(path):
                raise mboxError("%s does not exist" % path)
            # left open on purpose: the mailbox object reads from it lazily
            fp = open(path)
            self.mbox = mailbox.PortableUnixMailbox(fp)

    def __iter__(self):
        """Yield every message, or only those of the requested thread.

        With a msgid, a message is kept when it is the message itself,
        lists the id in References, or replies directly to it.
        """
        for m in self.mbox:
            if not self.msgid:
                yield m
                continue
            mid = m.get('Message-ID', "")
            ref = m.get('References', "")
            reply = m.get('In-Reply-to', "")
            if mid == self.msgid or self.msgid in ref or reply == self.msgid:
                yield m


# returns a tuple (index, stripped subject, suggested filename)
def parse_subject(mbox, msg):
    """Split a message subject into (index, subject, filename).

    mbox -- the mbox object the message came from; only its force flag is
            consulted.
    msg  -- the message; only get("Subject") and get("From") are used.

    For "[PATCH X/N] text" subjects the index is the string X; it is -1
    when there is no X/N part or for a 1/1 patch.  When the subject has no
    bracketed prefix the message is ignored (None is returned) unless
    mbox.force is set, in which case the index is -1 and the file name is
    prefixed with the local part of the first address found in From.
    Returns None when there is no subject at all.
    """
    # Folded headers may carry embedded newlines; collapse all whitespace
    # runs so the regex (whose .+ stops at a newline) sees the whole
    # subject.
    subj = ' '.join((msg.get("Subject") or "").split())
    if not subj:
        return None

    m = subjre.match(subj)
    if m:
        subj = m.group(6)
        idx = m.group(3)
        if idx is None or (idx == "1" and m.group(5) == "1"):
            idx = -1
        return (idx, subj, _make_filename(subj))

    if mbox.force:
        who = ""
        for word in (msg.get("From") or "").split():
            at = word.find('@')
            if at != -1:
                who = word[:at] + '_'
                break
        return (-1, subj, _make_filename(who + subj))
    return None


# pick a thread key for a given message.
# If the msg matches the passed msgid, msgid is the key
# if the index is <= 0, the Message-ID of the message is used
# otherwise, we try In-Reply-to, then References, then Message-ID
# picking the first one present in the headers.
#
def pick_key(msg, index, msgid):
    """Return the thread key (a header value, or "" if none is usable)."""
    # .get() instead of [] so a message without Message-ID does not raise
    own_id = msg.get('Message-ID', "")
    if msgid and own_id == msgid:
        return msgid
    if index <= 0:
        return own_id
    for name in ('In-Reply-to', 'References', 'Message-ID'):
        v = msg.get(name)
        if v:
            return v
    return ""


# write the message into a file named fname in dir.
# The From: and Subject: from the message are put into the output file
# If the contents of the message have a From: field
# use that instead of the email header.
#
# If the file already exists, it is not overwritten
#
def write_message(dir, msg, subj, fname):
    """Write one patch file.

    dir   -- output directory
    msg   -- the message; msg.fp and msg.get('from') are used
    subj  -- subject to record in the patch header (may be empty)
    fname -- preferred file name; "-1", "-2", ... is appended until the
             name does not exist yet
    """
    # NOTE(review): msg.fp is presumably positioned at the start of the
    # body, so `em` is the body parsed as if it were a message of its own
    # and header-like lines at the top of the body end up as headers of
    # `em` -- confirm against the rfc822/mailbox documentation.
    em = email.message_from_file(msg.fp)

    orig_fullpath = os.path.join(dir, fname)
    fullpath = orig_fullpath
    append = 1
    while os.path.exists(fullpath):
        fullpath = "%s-%d" % (orig_fullpath, append)
        append += 1
    sys.stderr.write("writing %s\n" % fname)

    msgfrom = msg.get('from')
    # a From: that the email parser consumed as a header of the body
    body_from = em.get('From')
    if body_from:
        msgfrom = body_from

    s = em.get_payload()
    s = s.replace('\r\n', '\n')
    # look for a From: line in the text, up to the first blank line or the
    # '---' marker
    for x in s.splitlines(True):
        if len(x) == 1:
            break
        if x.startswith('---'):
            break
        if x.startswith('From: '):
            # strip the line ending, otherwise a blank line would be
            # written between From: and Subject: below
            msgfrom = x[6:].strip()

    with open(fullpath, 'w') as f:
        nl = False
        if msgfrom:
            f.write("From: %s\n" % msgfrom)
            nl = True
        if subj:
            f.write("Subject: %s\n" % subj)
            nl = True
        if nl:
            f.write("\n")
        f.write(s)


# puts a dict of patches into a series file (fname) in a directory
# the series file is checked to avoid duplicates.
#
def create_series(dir, fname, patches):
    """Append one thread to the series file.

    dir, fname -- location of the series file (created if missing)
    patches    -- dict mapping a numeric index to text.  Index 0 is the
                  cover letter subject and is written as a "# " comment;
                  every other entry is a patch file name.  A thread made of
                  a single entry that is not a cover letter is preceded by
                  a bare "#" line.

    Entries already present in the series file are skipped with a warning
    on stderr.
    """
    keys = sorted(patches)
    with open(os.path.join(dir, fname), 'a+') as series_file:
        # where reading starts in 'a+' mode depends on the platform
        series_file.seek(0)
        patches_found = set()
        for line in series_file:
            line = line.lstrip(' \t').rstrip(' \t\r\n')
            if line:
                patches_found.add(line)
        series_file.seek(0, 2)

        for index in keys:
            text = patches[index]
            if index == 0:
                text = "# " + text
            # compare before adding the "#" separator line, otherwise a
            # lone patch could never match an existing entry
            if text in patches_found:
                sys.stderr.write("warning: %s already in series\n" % text)
                continue
            if index != 0 and len(keys) == 1:
                series_file.write("#\n")
            series_file.write("%s\n" % text)


def main():
    """Parse the command line, extract the patches, write the series."""
    usage = VERSION + "\nusage: %prog [options] [msgid|folder]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--directory", help="patch dir (patches)",
                      default="patches")
    parser.add_option("-f", "--folder", help="folder name (=inbox)",
                      default="=inbox")
    parser.add_option("-e", "--emlx", help="Use emlx folder format (false)",
                      action="store_true", default=False)
    parser.add_option("-o", "--one_thread",
                      help="Force single thread (false)",
                      action="store_true", default=False)
    parser.add_option("-s", "--series", help="series file name (series)",
                      default="series")
    parser.add_option("-m", "--mail-directory",
                      help="mail location (~/Mail)", default="~/Mail")
    (options, args) = parser.parse_args()

    if len(args) > 1:
        parser.print_help()
        sys.exit(1)

    # no argument (or an empty one): use inbox or the folder from -f
    msgid = ""
    if args and args[0]:
        if args[0].startswith('<') and args[0].endswith('>'):
            # msg given on command line
            msgid = args[0]
        else:
            # folder name given on command line
            options.folder = args[0]

    # if a folder name starts with =, prepend the folder directory into the
    # path, otherwise the folder name should point to a real file/dir
    if options.folder.startswith('='):
        options.folder = options.folder[1:]
    else:
        options.mail_directory = ""

    if not os.path.isdir(options.directory):
        os.makedirs(options.directory)
        sys.stderr.write("Creating %s\n" % options.directory)

    mb = mbox(os.path.join(os.path.expanduser(options.mail_directory),
                           options.folder), msgid, options.emlx)

    # the series dict matches thread keys to an ordered set of patches
    series = {}

    # the filenames set makes sure we don't duplicate filenames in the
    # patch dir
    filenames = set()

    # thread_order records the order in which we discover threads, this way
    # we create the series file in the same order the mailbox gave us
    # threads
    thread_order = []

    for m in mb:
        t = parse_subject(mb, m)
        if not t:
            continue
        index, subj, short = t
        idx = float(index)
        idx_tmp = idx
        short_orig = short

        if options.one_thread:
            k = 0
        else:
            k = pick_key(m, idx, msgid)

        d = series.setdefault(k, {})
        if not d:
            # first msg for this thread, remember when we saw it
            thread_order.append(k)

        # two messages claiming the same index: keep both, in arrival order
        while idx in d:
            idx += 0.1

        if idx == 0:
            # index 0 is not really a patch, just put it into the dict.
            d[idx] = subj
            continue

        # check for duplicate filenames against other patches found so far
        while short in filenames:
            short = "%s-%d" % (short_orig, idx_tmp)
            idx_tmp = idx_tmp + 1

        d[idx] = short
        filenames.add(short)
        write_message(options.directory, m, subj, short)

    # finally write out the series file
    for k in thread_order:
        create_series(options.directory, options.series, series[k])


if __name__ == "__main__":
    try:
        main()
    except mboxError as e:
        sys.stderr.write("mseries: %s\n" % e)
        sys.exit(1)