#!/usr/bin/env python

# Utility to filter a dump file of a Subversion repository to
# produce a dump file describing only specified subdirectories of
# the tree contained in the original one. This is similar in
# concept to the official tool `svndumpfilter', but it's able to
# cope with revisions which copy files into the area of interest
# from outside it (in which situation a Node-copyfrom won't be
# valid in the output dump file). However, in order to support
# this, svndumpfilter2 requires access via `svnlook' to the
# original repository from which the input dump file was produced.
#
# Usage:
#
#     svndumpfilter2 [options] source-repository regexp [regexp...]
#
# This command expects to receive a Subversion dump file on
# standard input, which must correspond to the Subversion
# repository pointed to by the first argument. It outputs a
# filtered dump file on standard output.
#
# `source-repository': The first argument must be a pathname to a
# _local_ Subversion repository. That is, it isn't a Subversion URL
# (beginning with http:// or svn:// or anything else like that);
# it's a simple local pathname (absolute or relative). A simple
# test to see if it's a valid pathname is to pass it as an argument
# to `svnlook tree'. If that succeeds, it's also a valid first
# argument to svndumpfilter2.
#
# `regexp': The remaining arguments are used to select directory
# names from the top level of the repository's internal directory
# tree. Any directory matching any of the regexps will be
# considered `interesting' and copied into the output dump file;
# any directory not matching will not. Matching is performed at the
# top level only: it is not currently possible to selectively
# include a subset of second-level directories with a common
# parent.
#
# Options include:
#
# `--drop-empty-revs': Exclude empty revisions from the output.
#
# `--renumber-revs': Generate sequential revision numbers in the
# filtered output. This may help work around issues with certain
# versions of 'svnadmin load'.
#
# For example, this command...
#
#     svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x
#
# ... will read a dump file on standard input, and output one on
# standard output which contains only the subdirectories `foo',
# `bar', `baz', `quux', `quuux', `quuuux', etc.
#
# You will probably usually want to use svndumpfilter2 in
# conjunction with the production of the dump file in the first
# place, like this:
#
#     svnadmin dump /home/svnadmin/myrepos | \
#     svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x > msv.dump

import sys
import os
import re
import string
import types
import hashlib
from optparse import OptionParser


# Quoting function which should render any string impervious to
# POSIX shell metacharacter expansion.
def quote(word):
    """Return `word' wrapped in single quotes for a POSIX shell.

    Embedded single quotes are emitted as '\\'' (close quote, escaped
    quote, reopen quote).
    """
    return "'" + word.replace("'", "'\\''") + "'"


# First, the sensible way to deal with a pathname is to split it
# into pieces at the slashes and thereafter treat it as a list.
def splitpath(s):
    """Split `s' at slashes and return the non-empty components."""
    return [part for part in s.split("/") if part != ""]


def joinpath(list, prefix=""):
    """Join path components with slashes, prepending `prefix'.

    The parameter is still called `list' (shadowing the builtin inside
    this function only) so that existing keyword callers keep working.
    """
    return prefix + "/".join(list)


def cleanpath(s):
    """Normalise `s': no leading, trailing or doubled slashes."""
    return joinpath(splitpath(s))


def catpath(path1, path2, prefix=""):
    """Concatenate two paths, normalising slashes, then add `prefix'."""
    return joinpath(splitpath(path1) + splitpath(path2), prefix)


# Decide whether a pathname is interesting or not.
class InterestingPaths:
    """Set of compiled regexps used to classify repository paths.

    Each pattern is applied with re.match() to the whole
    slash-normalised path, so it is anchored at the start of the path
    but not at its end.
    """

    def __init__(self, args):
        """Compile every pattern string in `args'."""
        self.res = [re.compile(a) for a in args]

    def interesting(self, path):
        """Return 1 if `path' should be kept in the output, else 0."""
        path = cleanpath(path)
        if path == '':
            # It's possible that the path may have no elements at
            # all, in which case we can't match on its first
            # element. This generally occurs when svn properties
            # are being changed on the root of the repository; we
            # consider those to be always interesting and never
            # filter them out.
            return 1
        for r in self.res:
            if r.match(path):
                return 1
        return 0


def _md5_hexdigest(data):
    """Return the hex MD5 digest of `data'.

    On Python 2 dump data is a byte string and is hashed directly. A
    Python 3 text string is encoded as latin-1 first; this is only
    correct when the text was decoded with a one-byte-per-character
    encoding, which is also what keeps len() equal to the byte count
    used for the *-content-length headers.
    """
    if not isinstance(data, bytes):
        data = data.encode("latin-1")
    return hashlib.md5(data).hexdigest()


# A class and some functions to handle a single lump of
# RFC822-ish-headers-plus-data read from an SVN dump file.
class Lump:
    """One record of a dump file: headers, property block and text.

    Attributes:
      hdrlist  header names, in the order they will be written
      hdrdict  header name -> value (strings)
      prop     raw property block as read / as regenerated
      text     node text, or None if the record has none
      extant   false to make write_lump() skip this record
      props    [ordered list of names, dict name -> value]; a value of
               None denotes a `D' (property deletion) record
    """

    def __init__(self):
        self.hdrlist = []
        self.hdrdict = {}
        self.prop = ""
        self.text = None
        self.extant = 1
        self.props = [[], {}]

    def sethdr(self, key, val):
        """Set header `key', appending it to the order if it is new."""
        if key not in self.hdrdict:
            self.hdrlist.append(key)
        self.hdrdict[key] = val

    def delhdr(self, key):
        """Remove header `key' if present; otherwise do nothing."""
        if key in self.hdrdict:
            del self.hdrdict[key]
            self.hdrlist.remove(key)

    def _counted_field(self, index):
        """Parse one `<tag> <length>\\n<data>\\n' record of self.prop.

        `index' points at the two-character tag. Returns a tuple
        (data, index just past the trailing newline). Raises ValueError
        if the length line is unterminated or not a number, or if the
        data is not followed by a newline.
        """
        nlpos = self.prop.find("\n", index)
        if nlpos < 0:
            raise ValueError("Unterminated length line in props section")
        length = int(self.prop[index+2:nlpos])
        start = nlpos + 1
        end = start + length
        if self.prop[end:end+1] != "\n":
            raise ValueError("Property data not followed by a newline")
        return self.prop[start:end], end + 1

    def propparse(self):
        """Parse self.prop into self.props.

        Understands `K'/`V' pairs, `D' records and the terminating
        PROPS-END line. Raises ValueError on anything else.
        """
        index = 0
        while 1:
            tag = self.prop[index:index+2]
            if tag == "K ":
                wantval = 1
            elif tag == "D ":
                wantval = 0
            elif self.prop[index:index+9] == "PROPS-END":
                break
            else:
                raise ValueError("Unrecognised record in props section")
            name, index = self._counted_field(index)
            if wantval:
                if self.prop[index:index+2] != "V ":
                    raise ValueError("Property key not followed by a value")
                value, index = self._counted_field(index)
            else:
                value = None
            # Go through setprop() so that a repeated name cannot end
            # up twice in the ordered list (which would later confuse
            # delprop() and correct_headers()).
            self.setprop(name, value)

    def setprop(self, key, val):
        """Set property `key'; a `val' of None records a deletion."""
        if key not in self.props[1]:
            self.props[0].append(key)
        self.props[1][key] = val

    def delprop(self, key):
        """Forget property `key' entirely, if it is present."""
        if key in self.props[1]:
            del self.props[1][key]
            self.props[0].remove(key)

    def correct_headers(self, revmap):
        """Regenerate self.prop and make the headers consistent with it.

        `revmap' is either None or a dict mapping input revision
        numbers to output revision numbers; when given it is applied to
        the Revision-number and Node-copyfrom-rev headers (KeyError if
        a referenced revision is missing from it).
        """
        # First reconstitute the properties block.
        pieces = []
        if self.props is not None and len(self.props[0]) > 0:
            for key in self.props[0]:
                val = self.props[1][key]
                if val is None:
                    pieces.append("D %d\n%s\n" % (len(key), key))
                else:
                    pieces.append("K %d\n%s\n" % (len(key), key))
                    pieces.append("V %d\n%s\n" % (len(val), val))
            pieces.append("PROPS-END\n")
        self.prop = "".join(pieces)

        # Now fix up the content length headers.
        if len(self.prop) > 0:
            self.sethdr("Prop-content-length", str(len(self.prop)))
        else:
            self.delhdr("Prop-content-length")

        # Only fiddle with the md5 if we're not doing a delta.
        if self.hdrdict.get("Text-delta", "false") != "true":
            if self.text is not None:
                self.sethdr("Text-content-length", str(len(self.text)))
                self.sethdr("Text-content-md5", _md5_hexdigest(self.text))
            else:
                self.delhdr("Text-content-length")
                self.delhdr("Text-content-md5")

        if len(self.prop) > 0 or self.text is not None:
            if self.text is None:
                textlen = 0
            else:
                textlen = len(self.text)
            self.sethdr("Content-length", str(len(self.prop) + textlen))
        else:
            self.delhdr("Content-length")

        # Adjust the revision numbers as needed.
        for header in ["Revision-number", "Node-copyfrom-rev"]:
            if header in self.hdrdict:
                old_val = int(self.hdrdict[header])
                if revmap is not None:
                    new_val = revmap[old_val]
                else:
                    new_val = old_val
                self.sethdr(header, str(new_val))


def read_rfc822_headers(f):
    """Read one header block from `f' into a new Lump.

    Blank lines before the first header are skipped; the blank line
    after the headers ends the block. Returns None if end of file is
    reached first. Raises ValueError on a line that is not of the form
    `Key: value'.
    """
    ret = Lump()
    while 1:
        s = f.readline()
        if s == "":
            return None  # end of file
        if s == "\n":
            if len(ret.hdrlist) > 0:
                break  # newline after headers ends them
            else:
                continue  # newline before headers is simply ignored
        if s[-1:] == "\n":
            s = s[:-1]
        colon = s.find(":")
        if colon <= 0 or s[colon:colon+2] != ": ":
            raise ValueError("Malformed header line in dump file: %r" % s)
        ret.sethdr(s[:colon], s[colon+2:])
    return ret


def read_lump(f):
    """Read the next complete record from `f'; None at end of file.

    The property block (if Prop-content-length is positive) is read and
    parsed, and the text (if Text-content-length is present) is read.
    """
    lump = read_rfc822_headers(f)
    if lump is None:
        return None
    pcl = int(lump.hdrdict.get("Prop-content-length", "0"))
    if pcl > 0:
        lump.prop = f.read(pcl)
        lump.propparse()
    if "Text-content-length" in lump.hdrdict:
        tcl = int(lump.hdrdict["Text-content-length"])
        lump.text = f.read(tcl)
    return lump


def write_lump(f, lump, revmap):
    """Write `lump' to `f' after correcting its headers.

    Nothing is written if lump.extant is false. `revmap' is passed on
    to Lump.correct_headers().
    """
    if not lump.extant:
        return
    lump.correct_headers(revmap)
    for key in lump.hdrlist:
        val = lump.hdrdict[key]
        f.write(key + ": " + val + "\n")
    f.write("\n")
    f.write(lump.prop)
    if lump.text is not None:
        f.write(lump.text)
    # A record that carried content is followed by a blank line.
    if "Prop-content-length" in lump.hdrdict or \
       "Text-content-length" in lump.hdrdict or \
       "Content-length" in lump.hdrdict:
        f.write("\n")


# Higher-level class that makes use of the above to filter dump
# file fragments a whole revision at a time.
class Filter: def __init__(self, paths): self.revisions = {} self.paths = paths def tweak(self, revhdr, contents): contents2 = [] for lump in contents: action = lump.hdrdict["Node-action"] path = lump.hdrdict["Node-path"] if not self.paths.interesting(path): continue # boooring need = 1 # we need to do something about this lump if action == "add": if lump.hdrdict.has_key("Node-copyfrom-path"): srcrev = string.atoi(lump.hdrdict["Node-copyfrom-rev"]) srcpath = lump.hdrdict["Node-copyfrom-path"] if not self.paths.interesting(srcpath): # Copy from a boring path to an interesting # one, meaning we must use svnlook to # extract the subtree and convert it into # lumps. treecmd = "svnlook tree -r%d %s %s" % \ (srcrev, quote(repos), quote(srcpath)) tree = os.popen(treecmd, "r") pathcomponents = [] while 1: treeline = tree.readline() if treeline == "": break if treeline[-1:] == "\n": treeline = treeline[:-1] subdir = 0 while treeline[-1:] == "/": subdir = 1 treeline = treeline[:-1] depth = 0 while treeline[:1] == " ": depth = depth + 1 treeline = treeline[1:] pathcomponents[depth:] = [treeline] thissrcpath = string.join([srcpath] + pathcomponents[1:], "/") thisdstpath = string.join([path] + pathcomponents[1:], "/") newlump = Lump() newlump.sethdr("Node-path", thisdstpath) newlump.sethdr("Node-action", "add") props = os.popen("svnlook pl -r%d %s %s" % \ (srcrev, quote(repos), quote(thissrcpath)), "r") while 1: propname = props.readline() if propname == "": break if propname[-1:] == "\n": propname = propname[:-1] while propname[:1] == " ": propname = propname[1:] propf = os.popen("svnlook pg -r%d %s %s %s" % \ (srcrev, quote(repos), quote(propname), quote(thissrcpath)), "r") proptext = propf.read() propf.close() newlump.setprop(propname, proptext) props.close() if subdir: newlump.sethdr("Node-kind", "dir") else: newlump.sethdr("Node-kind", "file") f = os.popen("svnlook cat -r%d %s %s" % \ (srcrev, quote(repos), quote(thissrcpath)), "r") newlump.text = f.read() f.close() 
contents2.append(newlump) tree.close() if lump.text != None: # This was a copyfrom _plus_ some sort of # delta or new contents, which means that # having done the copy we now also need a # change record providing the new contents. lump.sethdr("Node-action", "change") lump.delhdr("Node-copyfrom-rev") lump.delhdr("Node-copyfrom-path") else: need = 0 # we have now done something if need: contents2.append(lump) # Change the contents array. contents[:] = contents2 # If we've just removed everything in this revision, leave # out some revision properties as well. if (len(contents) == 0): revhdr.delprop("svn:log") revhdr.delprop("svn:author") revhdr.delprop("svn:date") fr = sys.stdin fw = sys.stdout # Parse our command-line arguments. parser = OptionParser(usage="Usage: %prog [options] src-repo regexp...") parser.add_option("--drop-empty-revs", action="store_true", dest="drop_empty_revs", default=False, help="filter empty revisions from the dump") parser.add_option("--renumber-revs", action="store_true", dest="renumber_revs", default=False, help="renumber remaining revisions") (options, args) = parser.parse_args() if len(args) < 2: print >>sys.stderr, sys.argv[0] + ": Too few arguments." print >>sys.stderr, parser.usage sys.exit(2) repos = args[0] paths = InterestingPaths(args[1:]) # We use this table to map input revisions to output revisions. if options.renumber_revs: revmap = {} else: revmap = None # Pass the dump-file header through unchanged. lump = read_lump(fr) while not lump.hdrdict.has_key("Revision-number"): write_lump(fw, lump, revmap) lump = read_lump(fr) revhdr = lump filt = Filter(paths) current_output_rev = 0 while revhdr != None: # Read revision header. assert revhdr.hdrdict.has_key("Revision-number") contents = [] # Read revision contents. while 1: lump = read_lump(fr) if lump == None or lump.hdrdict.has_key("Revision-number"): newrevhdr = lump break contents.append(lump) # Alter the contents of the revision. 
filt.tweak(revhdr, contents) # Determine whether we should output this revision. We only # update the current_output_rev if we're actually going to write # something. should_write = (len(contents) > 0 or not options.drop_empty_revs) if should_write: current_output_rev += 1 # Update our revmap with information about this revision. Note that # if this revision won't be written, current_output_rev still points # to the last version we dumped. input_rev = int(revhdr.hdrdict["Revision-number"]) if revmap != None: revmap[input_rev] = current_output_rev # Write out this revision, if that's what we've decided to do. if should_write: write_lump(fw, revhdr, revmap) for lump in contents: write_lump(fw, lump, revmap) # And loop round again. revhdr = newrevhdr fr.close() fw.close()