#!/usr/bin/env python

# Utility to filter a dump file of a Subversion repository to
# produce a dump file describing only specified subdirectories of
# the tree contained in the original one. This is similar in
# concept to the official tool `svndumpfilter', but it's able to
# cope with revisions which copy files into the area of interest
# from outside it (in which situation a Node-copyfrom won't be
# valid in the output dump file). However, in order to support
# this, svndumpfilter2 requires access via `svnlook' to the
# original repository from which the input dump file was produced.
#
# Usage:
#
#     svndumpfilter2 [options] source-repository regexp [regexp...]
#
# This command expects to receive a Subversion dump file on
# standard input, which must correspond to the Subversion
# repository pointed to by the first argument. It outputs a
# filtered dump file on standard output.
#
# `source-repository': The first argument must be a pathname to a
# _local_ Subversion repository. That is, it isn't a Subversion URL
# (beginning with http:// or svn:// or anything else like that);
# it's a simple local pathname (absolute or relative). A simple
# test to see if it's a valid pathname is to pass it as an argument
# to `svnlook tree'. If that succeeds, it's also a valid first
# argument to svndumpfilter2.
#
# `regexp': The remaining arguments are used to select directory
# names from the top level of the repository's internal directory
# tree. Any directory matching any of the regexps will be
# considered `interesting' and copied into the output dump file;
# any directory not matching will not. Matching is performed at the
# top level only: it is not currently possible to selectively
# include a subset of second-level directories with a common
# parent.
#
# Options include:
#
# `--drop-empty-revs': Exclude empty revisions from the output.
#
# `--renumber-revs': Generate sequential revision numbers in the
# filtered output.  This may help work around issues with certain
# versions of 'svnadmin load'.
#
# For example, this command...
#
#     svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x
#
# ... will read a dump file on standard input, and output one on
# standard output which contains only the subdirectories `foo',
# `bar', `baz', `quux', `quuux', `quuuux', etc.
#
# You will probably usually want to use svndumpfilter2 in
# conjunction with the production of the dump file in the first
# place, like this:
#
#     svnadmin dump /home/svnadmin/myrepos | \
#         svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x > msv.dump

import sys
import os
import re
import string
import types
import md5
from optparse import OptionParser

# Quoting function which should render any string impervious to
# POSIX shell metacharacter expansion.
def quote(word):
    """Return `word' wrapped in single quotes for a POSIX shell.

    Inside single quotes the shell treats every character literally
    except the single quote itself, so each embedded "'" is rewritten
    as '\\'' (close the quotes, emit an escaped quote, reopen them).
    """
    # Use the str method rather than the deprecated string.replace().
    return "'" + word.replace("'", "'\\''") + "'"

# First, the sensible way to deal with a pathname is to split it
# into pieces at the slashes and thereafter treat it as a list.
def splitpath(s):
    """Split the pathname `s' at slashes and return its components.

    Empty components (produced by leading, trailing or doubled
    slashes) are dropped, so "/a//b/" yields ["a", "b"] and both ""
    and "/" yield [].
    """
    # A single filtering pass replaces the old remove-until-ValueError
    # loop, which rescanned the list once per empty element.
    return [piece for piece in s.split("/") if piece != ""]

def joinpath(list, prefix=""):
    """Join path components with "/" and prepend `prefix'.

    `list' is a sequence of component strings (the parameter name is
    kept for compatibility with existing callers even though it
    shadows the builtin).  An empty sequence yields just `prefix'.
    """
    # Use the str method rather than the deprecated string.join().
    return prefix + "/".join(list)

def cleanpath(s):
    """Return `s' with its components split by splitpath() and
    rejoined by joinpath() with no prefix."""
    components = splitpath(s)
    return joinpath(components)

def catpath(path1, path2, prefix=""):
    """Concatenate two pathnames.

    Both paths are broken into components with splitpath(), the
    components of `path2' are appended to those of `path1', and the
    result is rebuilt by joinpath() with `prefix' in front.
    """
    head = splitpath(path1)
    tail = splitpath(path2)
    return joinpath(head + tail, prefix)

# Decide whether a pathname is interesting or not.
class InterestingPaths:
    """Set of regular expressions selecting the paths to keep.

    `args' is a sequence of regexp source strings; each is compiled
    once at construction time and stored in self.res.
    """
    def __init__(self, args):
        self.res = [re.compile(pattern) for pattern in args]

    def interesting(self, path):
        """Return 1 if `path' should be kept, 0 otherwise.

        The path is first normalised with cleanpath().  Each regexp
        is then tried with re.match(), i.e. anchored at the start of
        the whole cleaned path (not at its end).
        """
        cleaned = cleanpath(path)
        if cleaned == '':
            # A path with no elements at all generally occurs when
            # svn properties are being changed on the root of the
            # repository; those are always considered interesting
            # and never filtered out.
            return 1
        verdict = 0
        for regexp in self.res:
            if regexp.match(cleaned):
                verdict = 1
                break
        return verdict

# A class and some functions to handle a single lump of
# RFC822-ish-headers-plus-data read from an SVN dump file.

class Lump:
    """One record of an SVN dump file: headers, properties and text.

    Attributes:
      hdrlist -- header names, in the order they were first set
      hdrdict -- maps header name to its (string) value
      prop    -- the raw property block as a string ("" if none)
      text    -- the text content, or None if the record has none
      extant  -- true if write_lump() should emit this record
      props   -- two-element list [names, values]: `names' is the
                 list of property names in order, `values' maps each
                 name to its value, or to None for a "D" (deletion)
                 entry
    """
    def __init__(self):
        self.hdrlist = []
        self.hdrdict = {}
        self.prop = ""
        self.text = None
        self.extant = 1
        self.props = [[], {}]

    def sethdr(self, key, val):
        """Set header `key' to `val', appending it to the header
        order if it was not already present."""
        if key not in self.hdrdict:
            self.hdrlist.append(key)
        self.hdrdict[key] = val

    def delhdr(self, key):
        """Remove header `key'; do nothing if it is absent."""
        if key in self.hdrdict:
            del self.hdrdict[key]
            self.hdrlist.remove(key)

    def propparse(self):
        """Parse the raw block in self.prop into self.props.

        The block is a sequence of "K <len>\\n<name>\\nV <len>\\n<value>\\n"
        and "D <len>\\n<name>\\n" entries terminated by "PROPS-END".
        Parsed names are appended to self.props[0] and their values
        (None for "D" entries) stored in self.props[1].

        Raises ValueError on an unrecognised entry tag.  The length
        and layout checks are assertions, as before.
        """
        index = 0
        while 1:
            tag = self.prop[index:index+2]
            if tag == "K ":
                wantval = 1
            elif tag == "D ":
                wantval = 0
            elif self.prop[index:index+9] == "PROPS-END":
                break
            else:
                # String exceptions are rejected by Python 2.6 and
                # later, so raise a real exception object.
                raise ValueError("Unrecognised record in props section")
            nlpos = self.prop.find("\n", index)
            assert nlpos > 0
            namelen = int(self.prop[index+2:nlpos])
            assert self.prop[nlpos+1+namelen] == "\n"
            name = self.prop[nlpos+1:nlpos+1+namelen]
            # Skip the length line's newline, the name and its newline.
            index = nlpos+2+namelen
            if wantval:
                assert self.prop[index:index+2] == "V "
                nlpos = self.prop.find("\n", index)
                assert nlpos > 0
                proplen = int(self.prop[index+2:nlpos])
                assert self.prop[nlpos+1+proplen] == "\n"
                prop = self.prop[nlpos+1:nlpos+1+proplen]
                index = nlpos+2+proplen
            else:
                prop = None
            self.props[0].append(name)
            self.props[1][name] = prop

    def setprop(self, key, val):
        """Set property `key' to `val' (None marks a deletion entry),
        appending it to the property order if it is new."""
        if key not in self.props[1]:
            self.props[0].append(key)
        self.props[1][key] = val

    def delprop(self, key):
        """Forget property `key' entirely; do nothing if absent."""
        if key in self.props[1]:
            del self.props[1][key]
            self.props[0].remove(key)

    def correct_headers(self, revmap):
        """Rebuild self.prop and make the headers agree with the data.

        Regenerates the property block from self.props, then updates
        or removes Prop-content-length, Text-content-length,
        Text-content-md5 and Content-length to match.  `revmap' is
        either None or a mapping from input revision number (int) to
        output revision number; when given, Revision-number and
        Node-copyfrom-rev are translated through it (KeyError if the
        revision is not in the map).
        """
        # First reconstitute the properties block.
        pieces = []
        if self.props is not None and len(self.props[0]) > 0:
            for key in self.props[0]:
                val = self.props[1][key]
                if val is None:
                    pieces.extend(["D %d" % len(key), "\n", key, "\n"])
                else:
                    pieces.extend(["K %d" % len(key), "\n", key, "\n"])
                    pieces.extend(["V %d" % len(val), "\n", val, "\n"])
            pieces.append("PROPS-END\n")
        # NOTE(review): a record whose property list ends up empty
        # gets no property block at all (not a bare "PROPS-END");
        # this matches the previous behaviour.
        self.prop = "".join(pieces)
        # Now fix up the content length headers.
        if len(self.prop) > 0:
            self.sethdr("Prop-content-length", str(len(self.prop)))
        else:
            self.delhdr("Prop-content-length")
        # Only fiddle with the md5 if we're not doing a delta.
        if self.hdrdict.get("Text-delta", "false") != "true":
            if self.text is not None:
                # hashlib supersedes the deprecated md5 module; fall
                # back to md5 only on interpreters without hashlib.
                try:
                    from hashlib import md5 as new_md5
                except ImportError:
                    from md5 import new as new_md5
                self.sethdr("Text-content-length", str(len(self.text)))
                m = new_md5()
                m.update(self.text)
                self.sethdr("Text-content-md5", m.hexdigest())
            else:
                self.delhdr("Text-content-length")
                self.delhdr("Text-content-md5")
        if len(self.prop) > 0 or self.text is not None:
            if self.text is None:
                textlen = 0
            else:
                textlen = len(self.text)
            self.sethdr("Content-length", str(len(self.prop)+textlen))
        else:
            self.delhdr("Content-length")
        # Adjust the revision numbers as needed.
        for header in ["Revision-number", "Node-copyfrom-rev"]:
            if header in self.hdrdict:
                old_val = int(self.hdrdict[header])
                if revmap is not None:
                    new_val = revmap[old_val]
                else:
                    new_val = old_val
                self.sethdr(header, str(new_val))

def read_rfc822_headers(f):
    """Read one block of "Key: value" header lines from file `f'.

    Blank lines before the first header are skipped; the first blank
    line after at least one header ends the block.  Returns a new
    Lump holding the headers, or None if end of file is reached
    before the terminating blank line.  Each header line must contain
    ": " after a non-empty key (checked by assertion).
    """
    ret = Lump()
    while 1:
        s = f.readline()
        if s == "":
            return None # end of file
        if s == "\n":
            if len(ret.hdrlist) > 0:
                break # newline after headers ends them
            continue # newline before headers is simply ignored
        if s[-1:] == "\n":
            s = s[:-1]
        # Use the str method rather than the deprecated string.find().
        colon = s.find(":")
        assert colon > 0
        assert s[colon:colon+2] == ": "
        ret.sethdr(s[:colon], s[colon+2:])
    return ret

def read_lump(f):
    """Read one complete dump-file record from file `f'.

    Reads the header block with read_rfc822_headers(), then the
    property block (parsed via Lump.propparse()) if
    Prop-content-length is greater than zero, then the text if a
    Text-content-length header is present.  Returns the Lump, or None
    at end of file.
    """
    lump = read_rfc822_headers(f)
    if lump is None:
        return None
    pcl = int(lump.hdrdict.get("Prop-content-length", "0"))
    if pcl > 0:
        lump.prop = f.read(pcl)
        lump.propparse()
    if "Text-content-length" in lump.hdrdict:
        tcl = int(lump.hdrdict["Text-content-length"])
        lump.text = f.read(tcl)
    return lump

def write_lump(f, lump, revmap):
    """Write the record `lump' to file `f' in dump-file format.

    Does nothing if lump.extant is false.  Otherwise the lump's
    headers are first brought up to date with
    lump.correct_headers(revmap); then the headers, a blank line, the
    property block and the text (if any) are written.  A trailing
    newline follows whenever the record carries any content-length
    header.
    """
    if not lump.extant:
        return
    lump.correct_headers(revmap)
    for key in lump.hdrlist:
        f.write(key + ": " + lump.hdrdict[key] + "\n")
    f.write("\n")
    f.write(lump.prop)
    if lump.text is not None:
        f.write(lump.text)
    has_content = ("Prop-content-length" in lump.hdrdict or
                   "Text-content-length" in lump.hdrdict or
                   "Content-length" in lump.hdrdict)
    if has_content:
        f.write("\n")

# Higher-level class that makes use of the above to filter dump
# file fragments a whole revision at a time.

class Filter:
    """Filters the node records of one revision at a time.

    `paths' is an object with an interesting(path) method (normally
    an InterestingPaths).  Expanding copies from uninteresting paths
    runs `svnlook' against the repository named by the module-level
    variable `repos', which must be set before tweak() meets such a
    copy.
    """
    def __init__(self, paths):
        self.revisions = {}
        self.paths = paths

    def tweak(self, revhdr, contents):
        """Filter the node records of one revision in place.

        `contents' is the list of node Lumps of the revision; it is
        rewritten to hold only the records to be output.  Records on
        uninteresting paths are dropped.  An "add" copied from an
        uninteresting path is replaced by explicit "add" records for
        the whole copied subtree (fetched with svnlook), followed by
        the original record turned into a "change" if it carried new
        text or properties.  If nothing is left, svn:log, svn:author
        and svn:date are removed from the revision header `revhdr'.

        NOTE(review): a "replace" record copied from an uninteresting
        path is passed through unexpanded, as before.
        """
        contents2 = []
        for lump in contents:
            action = lump.hdrdict["Node-action"]
            path = lump.hdrdict["Node-path"]

            if not self.paths.interesting(path):
                continue # boooring

            if action == "add" and "Node-copyfrom-path" in lump.hdrdict:
                srcrev = int(lump.hdrdict["Node-copyfrom-rev"])
                srcpath = lump.hdrdict["Node-copyfrom-path"]
                if not self.paths.interesting(srcpath):
                    # Copy from a boring path to an interesting
                    # one, meaning we must use svnlook to extract
                    # the subtree and convert it into lumps.
                    contents2.extend(self._expand_copy(srcrev, srcpath, path))
                    # A copyfrom _plus_ new contents needs, after the
                    # copy, a change record providing them.  That
                    # applies to a property block as well as to text:
                    # the expanded records carry the properties of
                    # the copy source, so a property modification
                    # made by the copy would otherwise be lost.
                    modified = (lump.text is not None or
                                "Prop-content-length" in lump.hdrdict)
                    if not modified:
                        continue # the expansion was all we needed
                    lump.sethdr("Node-action", "change")
                    lump.delhdr("Node-copyfrom-rev")
                    lump.delhdr("Node-copyfrom-path")
            contents2.append(lump)

        # Change the contents array.
        contents[:] = contents2

        # If we've just removed everything in this revision, leave
        # out some revision properties as well.
        if len(contents) == 0:
            revhdr.delprop("svn:log")
            revhdr.delprop("svn:author")
            revhdr.delprop("svn:date")

    def _svnlook(self, subcommand, srcrev, *args):
        """Open a read pipe on `svnlook <subcommand> -r<srcrev> <repos>
        <args...>', with the repository and every argument
        shell-quoted by quote()."""
        words = [quote(repos)] + [quote(arg) for arg in args]
        command = "svnlook %s -r%d %s" % (subcommand, srcrev, " ".join(words))
        return os.popen(command, "r")

    def _expand_copy(self, srcrev, srcpath, dstpath):
        """Return "add" Lumps recreating under `dstpath' the subtree
        found at `srcpath' in revision `srcrev'."""
        lumps = []
        # Names of the directories enclosing the current tree line,
        # indexed by depth; element 0 is the copied node itself.
        pathcomponents = []
        tree = self._svnlook("tree", srcrev, srcpath)
        try:
            while 1:
                treeline = tree.readline()
                if treeline == "":
                    break
                if treeline[-1:] == "\n":
                    treeline = treeline[:-1]
                # A trailing slash marks a directory.
                subdir = 0
                while treeline[-1:] == "/":
                    subdir = 1
                    treeline = treeline[:-1]
                # Leading spaces are counted as the nesting depth.
                depth = 0
                while treeline[:1] == " ":
                    depth = depth + 1
                    treeline = treeline[1:]
                pathcomponents[depth:] = [treeline]
                below = pathcomponents[1:]
                thissrcpath = "/".join([srcpath] + below)
                thisdstpath = "/".join([dstpath] + below)
                lumps.append(self._node_lump(srcrev, thissrcpath,
                                             thisdstpath, subdir))
        finally:
            # Close the pipe even if building a record failed.
            tree.close()
        return lumps

    def _node_lump(self, srcrev, srcpath, dstpath, isdir):
        """Build the "add" Lump for one node: its properties, and for
        a file its contents, are read from `srcpath' at `srcrev'."""
        newlump = Lump()
        newlump.sethdr("Node-path", dstpath)
        newlump.sethdr("Node-action", "add")
        props = self._svnlook("pl", srcrev, srcpath)
        try:
            while 1:
                propname = props.readline()
                if propname == "":
                    break
                if propname[-1:] == "\n":
                    propname = propname[:-1]
                # Leading spaces on each line are stripped.
                while propname[:1] == " ":
                    propname = propname[1:]
                propf = self._svnlook("pg", srcrev, propname, srcpath)
                try:
                    proptext = propf.read()
                finally:
                    propf.close()
                newlump.setprop(propname, proptext)
        finally:
            props.close()
        if isdir:
            newlump.sethdr("Node-kind", "dir")
        else:
            newlump.sethdr("Node-kind", "file")
            f = self._svnlook("cat", srcrev, srcpath)
            try:
                newlump.text = f.read()
            finally:
                f.close()
        return newlump

# Main program: read a dump file from standard input, filter it one
# revision at a time, and write the result to standard output.
fr = sys.stdin
fw = sys.stdout

# Parse our command-line arguments.
parser = OptionParser(usage="Usage: %prog [options] src-repo regexp...")
parser.add_option("--drop-empty-revs", action="store_true",
                  dest="drop_empty_revs", default=False,
                  help="filter empty revisions from the dump")
parser.add_option("--renumber-revs", action="store_true",
                  dest="renumber_revs", default=False,
                  help="renumber remaining revisions")
(options, args) = parser.parse_args()
if len(args) < 2:
    sys.stderr.write(sys.argv[0] + ": Too few arguments.\n")
    # print_usage() expands %prog; the raw parser.usage string does not.
    parser.print_usage(sys.stderr)
    sys.exit(2)

# `repos' must stay a module-level name: Filter reads it when it has
# to run svnlook.
repos = args[0]
paths = InterestingPaths(args[1:])

# We use this table to map input revisions to output revisions.
if options.renumber_revs:
    revmap = {}
else:
    revmap = None

# Pass the dump-file header through unchanged.  Stop at end of file
# too, so that input containing no revision record at all does not
# crash on a None lump.
lump = read_lump(fr)
while lump is not None and "Revision-number" not in lump.hdrdict:
    write_lump(fw, lump, revmap)
    lump = read_lump(fr)

revhdr = lump

filt = Filter(paths)

# NOTE(review): the counter is incremented before it is used, so
# with --renumber-revs the first revision written is numbered 1
# (input revision 0 included) -- confirm that this is intended.
current_output_rev = 0
while revhdr is not None:
    # Read revision header.
    assert "Revision-number" in revhdr.hdrdict
    contents = []
    # Read revision contents: every lump up to the next revision
    # header (or end of file), which is kept for the next iteration.
    while 1:
        lump = read_lump(fr)
        if lump is None or "Revision-number" in lump.hdrdict:
            newrevhdr = lump
            break
        contents.append(lump)

    # Alter the contents of the revision.
    filt.tweak(revhdr, contents)

    # Determine whether we should output this revision.  We only
    # update the current_output_rev if we're actually going to write
    # something.
    should_write = (len(contents) > 0 or not options.drop_empty_revs)
    if should_write:
        current_output_rev += 1

    # Update our revmap with information about this revision.  Note that
    # if this revision won't be written, current_output_rev still points
    # to the last version we dumped.
    input_rev = int(revhdr.hdrdict["Revision-number"])
    if revmap is not None:
        revmap[input_rev] = current_output_rev

    # Write out this revision, if that's what we've decided to do.
    if should_write:
        write_lump(fw, revhdr, revmap)
        for lump in contents:
            write_lump(fw, lump, revmap)

    # And loop round again.
    revhdr = newrevhdr

fr.close()
fw.close()