bin-utils/svndumpfilter2

440 lines
14 KiB
Python
Executable File

#!/usr/bin/env python
# Utility to filter a dump file of a Subversion repository to
# produce a dump file describing only specified subdirectories of
# the tree contained in the original one. This is similar in
# concept to the official tool `svndumpfilter', but it's able to
# cope with revisions which copy files into the area of interest
# from outside it (in which situation a Node-copyfrom won't be
# valid in the output dump file). However, in order to support
# this, svndumpfilter2 requires access via `svnlook' to the
# original repository from which the input dump file was produced.
#
# Usage:
#
# svndumpfilter2 [options] source-repository regexp [regexp...]
#
# This command expects to receive a Subversion dump file on
# standard input, which must correspond to the Subversion
# repository pointed to by the first argument. It outputs a
# filtered dump file on standard output.
#
# `source-repository': The first argument must be a pathname to a
# _local_ Subversion repository. That is, it isn't a Subversion URL
# (beginning with http:// or svn:// or anything else like that);
# it's a simple local pathname (absolute or relative). A simple
# test to see if it's a valid pathname is to pass it as an argument
# to `svnlook tree'. If that succeeds, it's also a valid first
# argument to svndumpfilter2.
#
# `regexp': The remaining arguments are used to select directory
# names from the top level of the repository's internal directory
# tree. Any directory matching any of the regexps will be
# considered `interesting' and copied into the output dump file;
# any directory not matching will not. Matching is performed at the
# top level only: it is not currently possible to selectively
# include a subset of second-level directories with a common
# parent.
#
# Options include:
#
# `--drop-empty-revs': Exclude empty revisions from the output.
#
# `--renumber-revs': Generate sequential revision numbers in the
# filtered output. This may help work around issues with certain
# versions of 'svnadmin load'.
#
# For example, this command...
#
# svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x
#
# ... will read a dump file on standard input, and output one on
# standard output which contains only the subdirectories `foo',
# `bar', `baz', `quux', `quuux', `quuuux', etc.
#
# You will probably usually want to use svndumpfilter2 in
# conjunction with the production of the dump file in the first
# place, like this:
#
# svnadmin dump /home/svnadmin/myrepos | \
# svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x > msv.dump
import sys
import os
import re
import string
import types
import md5
from optparse import OptionParser
# Quoting function which should render any string impervious to
# POSIX shell metacharacter expansion.
def quote(word):
    """Return `word` wrapped in single quotes for a POSIX shell.

    Each embedded single quote is replaced by the four-character
    sequence quote, backslash, quote, quote: this closes the quoted
    string, emits an escaped literal quote, and reopens the string.
    """
    # Use the str method rather than the deprecated string.replace().
    return "'" + word.replace("'", "'\\''") + "'"
# First, the sensible way to deal with a pathname is to split it
# into pieces at the slashes and thereafter treat it as a list.
def splitpath(s):
    """Split a slash-separated pathname into a list of its components.

    Empty components (produced by leading, trailing or doubled
    slashes) are dropped, so "/a//b/" yields ["a", "b"] and "" yields
    an empty list.
    """
    # Filter in a single pass instead of repeatedly calling
    # list.remove("") until it raises ValueError.
    return [component for component in s.split("/") if component != ""]
def joinpath(list, prefix=""):
    """Join path components with slashes, preceded by `prefix`.

    `list` is a sequence of component strings (the parameter name is
    kept for compatibility with existing callers); `prefix` is
    prepended verbatim to the joined result.
    """
    # Use the str method rather than the deprecated string.join().
    return prefix + "/".join(list)
def cleanpath(s):
    """Normalise a pathname by splitting it and joining it back up.

    The result has no leading, trailing or repeated slashes, because
    splitpath() discards empty components.
    """
    components = splitpath(s)
    return joinpath(components)
def catpath(path1, path2, prefix=""):
    """Concatenate two pathnames into one normalised pathname.

    Both paths are split into components, the components of `path2`
    are placed after those of `path1`, and the result is joined with
    slashes and preceded by `prefix`.
    """
    components = splitpath(path1)
    components.extend(splitpath(path2))
    return joinpath(components, prefix)
# Decide whether a pathname is interesting or not.
class InterestingPaths:
    """Decides whether a repository pathname should be kept.

    `res` holds the compiled regular expressions built from the
    pattern strings given to the constructor.

    NOTE(review): each pattern is applied with re.match() to the whole
    cleaned path, so it is anchored at the start of the path only. The
    file header describes matching on top-level directory names --
    confirm which behaviour is intended.
    """

    def __init__(self, args):
        """Compile every pattern string in `args`."""
        self.res = [re.compile(pattern) for pattern in args]

    def interesting(self, path):
        """Return 1 if `path` is interesting, otherwise 0."""
        cleaned = cleanpath(path)
        if cleaned == '':
            # A path with no elements at all cannot be matched on
            # anything. This generally occurs when svn properties
            # are being changed on the root of the repository; those
            # are always considered interesting and never filtered
            # out.
            return 1
        for regexp in self.res:
            if regexp.match(cleaned):
                return 1
        return 0
# A class and some functions to handle a single lump of
# RFC822-ish-headers-plus-data read from an SVN dump file.
class Lump:
    """One record of an SVN dump file: RFC822-ish headers plus data.

    Attributes:
      hdrlist -- header names, in the order they were first set
      hdrdict -- maps header name to header value (both strings)
      prop    -- raw text of the properties section ("" if none)
      text    -- the text section, or None if there is none
      extant  -- true if the lump should be written out
      props   -- two-element list: [ordered property names,
                 {name: value}]; a value of None marks a deletion
                 ("D") record
    """

    def __init__(self):
        self.hdrlist = []
        self.hdrdict = {}
        self.prop = ""
        self.text = None
        self.extant = 1
        self.props = [[], {}]

    def sethdr(self, key, val):
        """Set header `key` to `val`, appending it if it is new."""
        if key not in self.hdrdict:
            self.hdrlist.append(key)
        self.hdrdict[key] = val

    def delhdr(self, key):
        """Remove header `key`; do nothing if it is not present."""
        if key in self.hdrdict:
            del self.hdrdict[key]
            self.hdrlist.remove(key)

    def propparse(self):
        """Parse the raw properties text in self.prop into self.props.

        The text is a sequence of records, each introduced by a
        "K <length>" line (name followed by a "V <length>" value) or
        a "D <length>" line (name only, recorded with value None),
        and terminated by "PROPS-END".

        Raises ValueError if a record starts with anything else.
        """
        index = 0
        while 1:
            marker = self.prop[index:index+2]
            if marker == "K ":
                wantval = 1
            elif marker == "D ":
                wantval = 0
            elif self.prop[index:index+9] == "PROPS-END":
                break
            else:
                # String exceptions (raise "...") are themselves a
                # TypeError on Python 2.6 and later, so raise a real
                # exception object.
                raise ValueError("Unrecognised record in props section")
            nlpos = self.prop.find("\n", index)
            assert nlpos > 0
            namelen = int(self.prop[index+2:nlpos])
            # The name must be followed immediately by a newline.
            assert self.prop[nlpos+1+namelen] == "\n"
            name = self.prop[nlpos+1:nlpos+1+namelen]
            index = nlpos+2+namelen
            if wantval:
                assert self.prop[index:index+2] == "V "
                nlpos = self.prop.find("\n", index)
                assert nlpos > 0
                proplen = int(self.prop[index+2:nlpos])
                assert self.prop[nlpos+1+proplen] == "\n"
                prop = self.prop[nlpos+1:nlpos+1+proplen]
                index = nlpos+2+proplen
            else:
                prop = None
            self.props[0].append(name)
            self.props[1][name] = prop

    def setprop(self, key, val):
        """Set property `key` to `val`, appending it if it is new."""
        if key not in self.props[1]:
            self.props[0].append(key)
        self.props[1][key] = val

    def delprop(self, key):
        """Forget property `key`; do nothing if it is not present."""
        if key in self.props[1]:
            del self.props[1][key]
            self.props[0].remove(key)

    def correct_headers(self, revmap):
        """Rebuild self.prop and bring the headers up to date.

        The properties text is regenerated from self.props, the
        length and MD5 headers are recomputed, and the
        "Revision-number" and "Node-copyfrom-rev" headers are
        translated through `revmap` (a dict mapping input revision
        numbers to output revision numbers) unless it is None.

        Raises KeyError if a revision is missing from `revmap`.
        """
        # First reconstitute the properties block.
        pieces = []
        if self.props is not None and len(self.props[0]) > 0:
            for key in self.props[0]:
                val = self.props[1][key]
                if val is None:
                    pieces.append("D %d\n%s\n" % (len(key), key))
                else:
                    pieces.append("K %d\n%s\n" % (len(key), key))
                    pieces.append("V %d\n%s\n" % (len(val), val))
            pieces.append("PROPS-END\n")
        self.prop = "".join(pieces)

        # Now fix up the content length headers.
        if len(self.prop) > 0:
            self.sethdr("Prop-content-length", str(len(self.prop)))
        else:
            self.delhdr("Prop-content-length")

        # Only fiddle with the md5 if we're not doing a delta.
        if self.hdrdict.get("Text-delta", "false") != "true":
            if self.text is not None:
                self.sethdr("Text-content-length", str(len(self.text)))
                m = md5.new()
                m.update(self.text)
                self.sethdr("Text-content-md5", m.hexdigest())
            else:
                self.delhdr("Text-content-length")
                self.delhdr("Text-content-md5")

        if len(self.prop) > 0 or self.text is not None:
            if self.text is None:
                textlen = 0
            else:
                textlen = len(self.text)
            self.sethdr("Content-length", str(len(self.prop)+textlen))
        else:
            self.delhdr("Content-length")

        # Adjust the revision numbers as needed. An empty revmap is
        # still a revmap, so test against None rather than truth.
        for header in ["Revision-number", "Node-copyfrom-rev"]:
            if header in self.hdrdict:
                old_val = int(self.hdrdict[header])
                if revmap is not None:
                    new_val = revmap[old_val]
                else:
                    new_val = old_val
                self.sethdr(header, str(new_val))
def read_rfc822_headers(f):
    """Read one block of "Key: value" header lines from `f`.

    Blank lines before the first header are skipped; the blank line
    after the headers ends the block. Returns a new Lump holding the
    headers, or None if end of file is reached before the block is
    terminated.

    Raises AssertionError if a line is not of the form "Key: value".
    """
    ret = Lump()
    while 1:
        s = f.readline()
        if s == "":
            return None  # end of file
        if s == "\n":
            if len(ret.hdrlist) > 0:
                break  # newline after headers ends them
            continue  # newline before headers is simply ignored
        if s[-1:] == "\n":
            s = s[:-1]
        colon = s.find(":")
        assert colon > 0, "malformed header line: %r" % (s,)
        assert s[colon:colon+2] == ": ", "malformed header line: %r" % (s,)
        ret.sethdr(s[:colon], s[colon+2:])
    return ret
def read_lump(f):
    """Read one complete lump (headers, properties, text) from `f`.

    Returns the Lump, or None if read_rfc822_headers() reports end of
    file. The properties section is read and parsed only when
    "Prop-content-length" is greater than zero; the text section is
    read whenever a "Text-content-length" header is present.
    """
    lump = read_rfc822_headers(f)
    if lump is None:
        return None
    pcl = int(lump.hdrdict.get("Prop-content-length", "0"))
    if pcl > 0:
        lump.prop = f.read(pcl)
        lump.propparse()
    if "Text-content-length" in lump.hdrdict:
        tcl = int(lump.hdrdict["Text-content-length"])
        lump.text = f.read(tcl)
    return lump
def write_lump(f, lump, revmap):
    """Write `lump` to the file object `f` in dump-file format.

    Nothing is written if the lump is not extant. Otherwise its
    headers are first corrected via lump.correct_headers(revmap),
    then the headers, a blank line, the properties text and the text
    (if any) are written. A trailing newline follows whenever the
    lump carries any content-length header.
    """
    if not lump.extant:
        return
    lump.correct_headers(revmap)
    for key in lump.hdrlist:
        f.write(key + ": " + lump.hdrdict[key] + "\n")
    f.write("\n")
    f.write(lump.prop)
    if lump.text is not None:
        f.write(lump.text)
    if ("Prop-content-length" in lump.hdrdict or
            "Text-content-length" in lump.hdrdict or
            "Content-length" in lump.hdrdict):
        f.write("\n")
# Higher-level class that makes use of the above to filter dump
# file fragments a whole revision at a time.
class Filter:
    """Filters the lumps of a dump file one whole revision at a time."""

    def __init__(self, paths):
        """`paths` is the object whose interesting() method decides
        which pathnames are kept."""
        self.revisions = {}
        self.paths = paths

    def _svnlook(self, subcommand, srcrev, *args):
        """Run `svnlook SUBCOMMAND -rSRCREV REPOS ARGS...' and return
        everything it wrote to standard output.

        REPOS is the module-level `repos' pathname. A warning is
        written to standard error if the command reports a non-zero
        exit status, since the filtered dump is then likely to be
        incomplete.
        """
        quoted = [quote(arg) for arg in (repos,) + args]
        cmd = "svnlook %s -r%d %s" % (subcommand, srcrev, " ".join(quoted))
        pipe = os.popen(cmd, "r")
        output = pipe.read()
        # close() returns None on success, otherwise the exit status.
        status = pipe.close()
        if status is not None:
            sys.stderr.write("svndumpfilter2: warning: `%s' exited with "
                             "status %s\n" % (cmd, status))
        return output

    def _output_lines(self, output):
        """Split command output into lines without their newlines."""
        lines = output.split("\n")
        # Output ending in a newline leaves one empty trailing element.
        if lines[-1] == "":
            del lines[-1]
        return lines

    def _lumps_from_tree(self, srcrev, srcpath, dstpath):
        """Build "add" lumps recreating the subtree at `srcpath' in
        revision `srcrev' underneath `dstpath'.

        The tree listing, property names, property values and file
        contents are all obtained from svnlook. Returns the new lumps
        in the order svnlook lists the tree.
        """
        lumps = []
        pathcomponents = []
        for treeline in self._output_lines(
                self._svnlook("tree", srcrev, srcpath)):
            # A trailing slash marks a directory.
            stripped = treeline.rstrip("/")
            subdir = (stripped != treeline)
            # The number of leading spaces gives the nesting depth.
            name = stripped.lstrip(" ")
            depth = len(stripped) - len(name)
            pathcomponents[depth:] = [name]
            # Element 0 is the root of the listing itself, which is
            # already named by srcpath / dstpath.
            relative = pathcomponents[1:]
            thissrcpath = "/".join([srcpath] + relative)
            thisdstpath = "/".join([dstpath] + relative)
            newlump = Lump()
            newlump.sethdr("Node-path", thisdstpath)
            newlump.sethdr("Node-action", "add")
            for propname in self._output_lines(
                    self._svnlook("pl", srcrev, thissrcpath)):
                propname = propname.lstrip(" ")
                proptext = self._svnlook("pg", srcrev, propname,
                                         thissrcpath)
                newlump.setprop(propname, proptext)
            if subdir:
                newlump.sethdr("Node-kind", "dir")
            else:
                newlump.sethdr("Node-kind", "file")
                newlump.text = self._svnlook("cat", srcrev, thissrcpath)
            lumps.append(newlump)
        return lumps

    def tweak(self, revhdr, contents):
        """Filter the lumps of one revision in place.

        `contents' is the list of node lumps of the revision; it is
        replaced with the lumps to be output. Lumps on uninteresting
        paths are dropped. An "add" copied from an uninteresting path
        is replaced by lumps reconstructed with svnlook, followed by
        the original lump turned into a "change" if it carried text.
        If nothing is left, the svn:log, svn:author and svn:date
        properties are removed from `revhdr'.
        """
        contents2 = []
        for lump in contents:
            action = lump.hdrdict["Node-action"]
            path = lump.hdrdict["Node-path"]
            if not self.paths.interesting(path):
                continue  # boooring
            need = 1  # we need to do something about this lump
            if action == "add" and "Node-copyfrom-path" in lump.hdrdict:
                srcrev = int(lump.hdrdict["Node-copyfrom-rev"])
                srcpath = lump.hdrdict["Node-copyfrom-path"]
                if not self.paths.interesting(srcpath):
                    # Copy from a boring path to an interesting
                    # one, meaning we must use svnlook to extract
                    # the subtree and convert it into lumps.
                    contents2.extend(
                        self._lumps_from_tree(srcrev, srcpath, path))
                    if lump.text is not None:
                        # This was a copyfrom _plus_ some sort of
                        # delta or new contents, which means that
                        # having done the copy we now also need a
                        # change record providing the new contents.
                        lump.sethdr("Node-action", "change")
                        lump.delhdr("Node-copyfrom-rev")
                        lump.delhdr("Node-copyfrom-path")
                    else:
                        need = 0  # we have now done something
            if need:
                contents2.append(lump)
        # Change the contents array.
        contents[:] = contents2
        # If we've just removed everything in this revision, leave
        # out some revision properties as well.
        if len(contents) == 0:
            revhdr.delprop("svn:log")
            revhdr.delprop("svn:author")
            revhdr.delprop("svn:date")
# Main program: read a dump file on standard input, filter it one
# revision at a time, and write the result to standard output.
fr = sys.stdin
fw = sys.stdout

# Parse our command-line arguments.
parser = OptionParser(usage="Usage: %prog [options] src-repo regexp...")
parser.add_option("--drop-empty-revs", action="store_true",
                  dest="drop_empty_revs", default=False,
                  help="filter empty revisions from the dump")
parser.add_option("--renumber-revs", action="store_true",
                  dest="renumber_revs", default=False,
                  help="renumber remaining revisions")
(options, args) = parser.parse_args()
if len(args) < 2:
    sys.stderr.write(sys.argv[0] + ": Too few arguments.\n")
    # parser.usage is the raw template with `%prog' left unexpanded;
    # print_usage() substitutes the program name.
    parser.print_usage(sys.stderr)
    sys.exit(2)

# `repos' is read as a global by Filter when it invokes svnlook.
repos = args[0]
paths = InterestingPaths(args[1:])

# We use this table to map input revisions to output revisions.
if options.renumber_revs:
    revmap = {}
else:
    revmap = None

# Pass the dump-file header through unchanged. read_lump() returns
# None at end of file, so an empty or revision-less input simply
# falls through the main loop below instead of crashing.
lump = read_lump(fr)
while lump is not None and "Revision-number" not in lump.hdrdict:
    write_lump(fw, lump, revmap)
    lump = read_lump(fr)
revhdr = lump

filt = Filter(paths)

# NOTE(review): the counter is incremented before the first revision
# is written, so with --renumber-revs the first revision output is
# numbered 1 even if it was revision 0 in the input -- confirm this
# is intended.
current_output_rev = 0
while revhdr is not None:
    # Read revision header.
    assert "Revision-number" in revhdr.hdrdict
    contents = []

    # Read revision contents.
    while 1:
        lump = read_lump(fr)
        if lump is None or "Revision-number" in lump.hdrdict:
            newrevhdr = lump
            break
        contents.append(lump)

    # Alter the contents of the revision.
    filt.tweak(revhdr, contents)

    # Determine whether we should output this revision. We only
    # update the current_output_rev if we're actually going to write
    # something.
    should_write = (len(contents) > 0 or not options.drop_empty_revs)
    if should_write:
        current_output_rev += 1

    # Update our revmap with information about this revision. Note that
    # if this revision won't be written, current_output_rev still points
    # to the last version we dumped.
    input_rev = int(revhdr.hdrdict["Revision-number"])
    if revmap is not None:
        revmap[input_rev] = current_output_rev

    # Write out this revision, if that's what we've decided to do.
    if should_write:
        write_lump(fw, revhdr, revmap)
        for lump in contents:
            write_lump(fw, lump, revmap)

    # And loop round again.
    revhdr = newrevhdr

fr.close()
fw.close()