440 lines
14 KiB
Python
Executable File
440 lines
14 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
# Utility to filter a dump file of a Subversion repository to
|
|
# produce a dump file describing only specified subdirectories of
|
|
# the tree contained in the original one. This is similar in
|
|
# concept to the official tool `svndumpfilter', but it's able to
|
|
# cope with revisions which copy files into the area of interest
|
|
# from outside it (in which situation a Node-copyfrom won't be
|
|
# valid in the output dump file). However, in order to support
|
|
# this, svndumpfilter2 requires access via `svnlook' to the
|
|
# original repository from which the input dump file was produced.
|
|
#
|
|
# Usage:
|
|
#
|
|
# svndumpfilter2 [options] source-repository regexp [regexp...]
|
|
#
|
|
# This command expects to receive a Subversion dump file on
|
|
# standard input, which must correspond to the Subversion
|
|
# repository pointed to by the first argument. It outputs a
|
|
# filtered dump file on standard output.
|
|
#
|
|
# `source-repository': The first argument must be a pathname to a
|
|
# _local_ Subversion repository. That is, it isn't a Subversion URL
|
|
# (beginning with http:// or svn:// or anything else like that);
|
|
# it's a simple local pathname (absolute or relative). A simple
|
|
# test to see if it's a valid pathname is to pass it as an argument
|
|
# to `svnlook tree'. If that succeeds, it's also a valid first
|
|
# argument to svndumpfilter2.
|
|
#
|
|
# `regexp': The remaining arguments are used to select directory
|
|
# names from the top level of the repository's internal directory
|
|
# tree. Any directory matching any of the regexps will be
|
|
# considered `interesting' and copied into the output dump file;
|
|
# any directory not matching will not. Matching is performed at the
|
|
# top level only: it is not currently possible to selectively
|
|
# include a subset of second-level directories with a common
|
|
# parent.
|
|
#
|
|
# Options include:
|
|
#
|
|
# `--drop-empty-revs': Exclude empty revisions from the output.
|
|
#
|
|
# `--renumber-revs': Generate sequential revision numbers in the
|
|
# filtered output. This may help work around issues with certain
|
|
# versions of 'svnadmin load'.
|
|
#
|
|
# For example, this command...
|
|
#
|
|
# svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x
|
|
#
|
|
# ... will read a dump file on standard input, and output one on
|
|
# standard output which contains only the subdirectories `foo',
|
|
# `bar', `baz', `quux', `quuux', `quuuux', etc.
|
|
#
|
|
# You will probably usually want to use svndumpfilter2 in
|
|
# conjunction with the production of the dump file in the first
|
|
# place, like this:
|
|
#
|
|
# svnadmin dump /home/svnadmin/myrepos | \
|
|
# svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x > msv.dump
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import string
|
|
import types
|
|
import md5
|
|
from optparse import OptionParser
|
|
|
|
# Quoting function which should render any string impervious to
|
|
# POSIX shell metacharacter expansion.
|
|
def quote(word):
    """Return `word' quoted so a POSIX shell treats it as one literal word.

    The result is wrapped in single quotes.  Inside single quotes the
    shell interprets nothing, so the only character needing attention is
    the single quote itself.
    """
    # An embedded single quote is emitted as: close the quoted string,
    # a backslash-escaped quote, then reopen the quoted string.
    return "'" + word.replace("'", "'\\''") + "'"
|
|
|
|
# First, the sensible way to deal with a pathname is to split it
|
|
# into pieces at the slashes and thereafter treat it as a list.
|
|
def splitpath(s):
    """Split the pathname `s' at slashes and return its components.

    Empty components (produced by leading, trailing or doubled slashes)
    are dropped, so "/a//b/" and "a/b" both yield ["a", "b"].  An empty
    string, or one consisting only of slashes, yields [].
    """
    return [component for component in s.split("/") if component != ""]
|
|
|
|
def joinpath(list, prefix=""):
    """Join path components with slashes and prepend `prefix'.

    `list' is a sequence of component strings (the parameter name is
    kept for compatibility with existing callers even though it shadows
    the builtin).  `prefix' is prepended verbatim; no slash is inserted
    between it and the first component.
    """
    return prefix + "/".join(list)
|
|
|
|
def cleanpath(s):
    """Return `s' in canonical form.

    The path is broken into components with splitpath() and glued back
    together with joinpath(), using no prefix.
    """
    components = splitpath(s)
    return joinpath(components)
|
|
|
|
def catpath(path1, path2, prefix=""):
    """Concatenate two pathnames and return the joined result.

    Both paths are split into components with splitpath(); the combined
    component list is then handed to joinpath() together with `prefix'.
    """
    components = splitpath(path1)
    components = components + splitpath(path2)
    return joinpath(components, prefix)
|
|
|
|
# Decide whether a pathname is interesting or not.
|
|
class InterestingPaths:
    """Decide whether a repository pathname should be kept.

    The constructor compiles every string in `args' as a regular
    expression; interesting() then tests paths against those patterns.
    """

    def __init__(self, args):
        # Compiled patterns, in the order the arguments were given.
        self.res = [re.compile(pattern) for pattern in args]

    def interesting(self, path):
        """Return 1 if `path' is interesting, 0 otherwise.

        The path is first normalised with cleanpath().  Each pattern is
        applied with match(), i.e. anchored at the start of the
        normalised path.
        """
        path = cleanpath(path)

        # It's possible that the path may have no elements at all, in
        # which case there is no first element to match on.  This
        # generally occurs when svn properties are being changed on the
        # root of the repository; those are always considered
        # interesting and are never filtered out.
        if path == '':
            return 1

        verdict = 0
        for pattern in self.res:
            if pattern.match(path):
                verdict = 1
                break
        return verdict
|
|
|
|
# A class and some functions to handle a single lump of
|
|
# RFC822-ish-headers-plus-data read from an SVN dump file.
|
|
|
|
class Lump:
    """One record of an SVN dump file: headers, properties and text.

    The headers are RFC822-style "Key: value" pairs.  Their order is
    remembered so that a record can be written out with its headers in
    the order they were read or set.
    """

    def __init__(self):
        # Header names in insertion order, and the name -> value mapping.
        self.hdrlist = []
        self.hdrdict = {}
        # Raw (serialised) property section; rebuilt by correct_headers().
        self.prop = ""
        # Text content, or None if the record carries no text.
        self.text = None
        # A false value makes write_lump() skip this record.
        self.extant = 1
        # Parsed properties: [names in order, {name: value}].  A value
        # of None records a property deletion ("D" entry).
        self.props = [[], {}]

    def sethdr(self, key, val):
        """Set header `key' to `val', appending it if it is new."""
        if key not in self.hdrdict:
            self.hdrlist.append(key)
        self.hdrdict[key] = val

    def delhdr(self, key):
        """Remove header `key' if present; do nothing otherwise."""
        if key in self.hdrdict:
            del self.hdrdict[key]
            self.hdrlist.remove(key)

    def propparse(self):
        """Parse the raw property section in self.prop into self.props.

        The section is a sequence of "K <len>" name / "V <len>" value
        pairs and "D <len>" name-only (deletion) entries, terminated by
        "PROPS-END".  Raises ValueError if the section is malformed.
        """
        index = 0
        while 1:
            tag = self.prop[index:index+2]
            if tag == "K ":
                wantval = 1
            elif tag == "D ":
                wantval = 0
            elif self.prop[index:index+9] == "PROPS-END":
                break
            else:
                raise ValueError("Unrecognised record in props section")

            # "<tag> <length>\n<name>\n"
            nlpos = self.prop.find("\n", index)
            if nlpos <= 0:
                raise ValueError("Unterminated length line in props section")
            namelen = int(self.prop[index+2:nlpos])
            # Slicing (rather than indexing) so that truncated input is
            # reported as malformed instead of raising IndexError.
            if self.prop[nlpos+1+namelen:nlpos+2+namelen] != "\n":
                raise ValueError("Property name length mismatch")
            name = self.prop[nlpos+1:nlpos+1+namelen]
            index = nlpos+2+namelen

            if wantval:
                # "V <length>\n<value>\n"
                if self.prop[index:index+2] != "V ":
                    raise ValueError("Property name not followed by a value")
                nlpos = self.prop.find("\n", index)
                if nlpos <= 0:
                    raise ValueError(
                        "Unterminated length line in props section")
                proplen = int(self.prop[index+2:nlpos])
                if self.prop[nlpos+1+proplen:nlpos+2+proplen] != "\n":
                    raise ValueError("Property value length mismatch")
                prop = self.prop[nlpos+1:nlpos+1+proplen]
                index = nlpos+2+proplen
            else:
                prop = None

            self.props[0].append(name)
            self.props[1][name] = prop

    def setprop(self, key, val):
        """Set property `key' to `val', appending it if it is new."""
        if key not in self.props[1]:
            self.props[0].append(key)
        self.props[1][key] = val

    def delprop(self, key):
        """Forget property `key' entirely if present.

        This removes the entry from the parsed properties; it does not
        create a "D" (deletion) entry.
        """
        if key in self.props[1]:
            del self.props[1][key]
            self.props[0].remove(key)

    def correct_headers(self, revmap):
        """Rebuild self.prop and make the headers consistent with it.

        The property section is re-serialised from self.props, the
        length (and, for non-delta text, MD5) headers are recomputed,
        and the "Revision-number" / "Node-copyfrom-rev" headers are
        translated through `revmap', a dict mapping input revision
        numbers to output ones, or None to leave them unchanged.
        """
        # Imported here so this method does not rely on the deprecated
        # top-level `md5' module.
        import hashlib

        # First reconstitute the properties block.
        # NOTE(review): when no properties are recorded, no section at
        # all is produced -- not even a bare "PROPS-END".  Confirm that
        # this is intended for records whose input section was empty.
        pieces = []
        if self.props is not None and len(self.props[0]) > 0:
            for key in self.props[0]:
                val = self.props[1][key]
                if val is None:
                    pieces.append("D %d" % len(key) + "\n" + key + "\n")
                else:
                    pieces.append("K %d" % len(key) + "\n" + key + "\n")
                    pieces.append("V %d" % len(val) + "\n" + val + "\n")
            pieces.append("PROPS-END\n")
        self.prop = "".join(pieces)

        # Now fix up the content length headers.
        if len(self.prop) > 0:
            self.sethdr("Prop-content-length", str(len(self.prop)))
        else:
            self.delhdr("Prop-content-length")

        # Only fiddle with the md5 if we're not doing a delta.
        if self.hdrdict.get("Text-delta", "false") != "true":
            if self.text is not None:
                self.sethdr("Text-content-length", str(len(self.text)))
                self.sethdr("Text-content-md5",
                            hashlib.md5(self.text).hexdigest())
            else:
                self.delhdr("Text-content-length")
                self.delhdr("Text-content-md5")

        if len(self.prop) > 0 or self.text is not None:
            if self.text is None:
                textlen = 0
            else:
                textlen = len(self.text)
            self.sethdr("Content-length", str(len(self.prop) + textlen))
        else:
            self.delhdr("Content-length")

        # Adjust the revision numbers as needed.
        for header in ("Revision-number", "Node-copyfrom-rev"):
            if header in self.hdrdict:
                old_val = int(self.hdrdict[header])
                if revmap is not None:
                    new_val = revmap[old_val]
                else:
                    new_val = old_val
                self.sethdr(header, str(new_val))
|
|
|
|
def read_rfc822_headers(f):
    """Read one block of "Key: value" header lines from `f'.

    Blank lines before the first header are skipped; the blank line
    after the headers ends the block.  Returns a new Lump carrying the
    headers, or None if end of file is reached before the terminating
    blank line.  Raises ValueError on a line that is not of the form
    "Key: value".
    """
    ret = Lump()
    while 1:
        s = f.readline()
        if s == "":
            return None  # end of file
        if s == "\n":
            if len(ret.hdrlist) > 0:
                break  # newline after headers ends them
            continue  # newline before headers is simply ignored
        if s[-1:] == "\n":
            s = s[:-1]
        colon = s.find(":")
        # The key must be non-empty and the colon must be followed by a
        # single space separating it from the value.
        if colon <= 0 or s[colon:colon+2] != ": ":
            raise ValueError("Malformed header line: %r" % (s,))
        ret.sethdr(s[:colon], s[colon+2:])
    return ret
|
|
|
|
def read_lump(f):
    """Read one complete dump-file record from `f'.

    The headers are read with read_rfc822_headers().  If they announce
    a non-empty property section ("Prop-content-length") it is read and
    parsed; if they announce text ("Text-content-length") it is read
    into lump.text.  Returns the Lump, or None at end of file.
    """
    lump = read_rfc822_headers(f)
    if lump is None:
        return None
    pcl = int(lump.hdrdict.get("Prop-content-length", "0"))
    if pcl > 0:
        lump.prop = f.read(pcl)
        lump.propparse()
    if "Text-content-length" in lump.hdrdict:
        tcl = int(lump.hdrdict["Text-content-length"])
        lump.text = f.read(tcl)
    return lump
|
|
|
|
def write_lump(f, lump, revmap):
    """Write the record `lump' to the file object `f'.

    Nothing is written if lump.extant is false.  Otherwise the lump's
    headers are first brought up to date with correct_headers(revmap),
    then the headers, a blank line, the property section and any text
    are written in that order.  A trailing newline follows whenever the
    record has a content-length header of any kind.
    """
    if not lump.extant:
        return
    lump.correct_headers(revmap)
    for key in lump.hdrlist:
        f.write(key + ": " + lump.hdrdict[key] + "\n")
    f.write("\n")
    f.write(lump.prop)
    if lump.text is not None:
        f.write(lump.text)
    has_content = ("Prop-content-length" in lump.hdrdict or
                   "Text-content-length" in lump.hdrdict or
                   "Content-length" in lump.hdrdict)
    if has_content:
        f.write("\n")
|
|
|
|
# Higher-level class that makes use of the above to filter dump
|
|
# file fragments a whole revision at a time.
|
|
|
|
class Filter:
    """Filter the node records of a dump file one revision at a time.

    `paths' is an object with an interesting(path) method (see
    InterestingPaths) that decides which node paths are kept.
    """

    def __init__(self, paths):
        # Not read anywhere in this class; kept for compatibility.
        self.revisions = {}
        self.paths = paths

    def tweak(self, revhdr, contents):
        """Filter the node records of one revision in place.

        `contents' is the list of node Lumps of the revision; it is
        rewritten so that it holds only records for interesting paths.
        An "add" that copies from an uninteresting path is replaced by
        explicit "add" records built from the source repository via
        svnlook.  If nothing is left, the svn:log, svn:author and
        svn:date properties are removed from the revision header
        `revhdr'.
        """
        kept = []
        for lump in contents:
            action = lump.hdrdict["Node-action"]
            path = lump.hdrdict["Node-path"]

            if not self.paths.interesting(path):
                continue  # boooring

            need = 1  # we need to do something about this lump

            if action == "add" and "Node-copyfrom-path" in lump.hdrdict:
                srcrev = int(lump.hdrdict["Node-copyfrom-rev"])
                srcpath = lump.hdrdict["Node-copyfrom-path"]
                if not self.paths.interesting(srcpath):
                    # Copy from a boring path to an interesting one,
                    # meaning we must use svnlook to extract the
                    # subtree and convert it into lumps.
                    kept.extend(self._expand_copy(srcrev, srcpath, path))
                    if lump.text is not None:
                        # This was a copyfrom _plus_ some sort of delta
                        # or new contents, which means that having done
                        # the copy we now also need a change record
                        # providing the new contents.
                        lump.sethdr("Node-action", "change")
                        lump.delhdr("Node-copyfrom-rev")
                        lump.delhdr("Node-copyfrom-path")
                    else:
                        need = 0  # we have now done something

            if need:
                kept.append(lump)

        # Change the contents array.
        contents[:] = kept

        # If we've just removed everything in this revision, leave out
        # some revision properties as well.
        if len(contents) == 0:
            revhdr.delprop("svn:log")
            revhdr.delprop("svn:author")
            revhdr.delprop("svn:date")

    def _expand_copy(self, srcrev, srcpath, dstpath):
        """Return "add" Lumps recreating `srcpath'@`srcrev' at `dstpath'.

        One Lump is produced per line of `svnlook tree' output.  The
        parsing assumes each entry is indented by one space per level
        of depth and that directories carry a trailing slash.
        """
        lumps = []
        treecmd = "svnlook tree -r%d %s %s" % \
            (srcrev, quote(repos), quote(srcpath))
        # NOTE(review): the exit status of the svnlook commands is not
        # examined, so a failing command looks like empty output.
        tree = os.popen(treecmd, "r")
        try:
            pathcomponents = []
            for treeline in tree:
                if treeline[-1:] == "\n":
                    treeline = treeline[:-1]
                without_slash = treeline.rstrip("/")
                subdir = without_slash != treeline
                name = without_slash.lstrip(" ")
                depth = len(without_slash) - len(name)

                # Entry 0 is the root of the copied tree itself; the
                # remaining entries are the path relative to that root.
                pathcomponents[depth:] = [name]
                relative = pathcomponents[1:]
                thissrcpath = "/".join([srcpath] + relative)
                thisdstpath = "/".join([dstpath] + relative)

                newlump = Lump()
                newlump.sethdr("Node-path", thisdstpath)
                newlump.sethdr("Node-action", "add")
                self._copy_props(newlump, srcrev, thissrcpath)
                if subdir:
                    newlump.sethdr("Node-kind", "dir")
                else:
                    newlump.sethdr("Node-kind", "file")
                    newlump.text = self._read_command(
                        "svnlook cat -r%d %s %s" %
                        (srcrev, quote(repos), quote(thissrcpath)))
                lumps.append(newlump)
        finally:
            tree.close()
        return lumps

    def _copy_props(self, newlump, srcrev, thissrcpath):
        """Set on `newlump' every property of `thissrcpath'@`srcrev'."""
        props = os.popen("svnlook pl -r%d %s %s" %
                         (srcrev, quote(repos), quote(thissrcpath)), "r")
        try:
            for propname in props:
                if propname[-1:] == "\n":
                    propname = propname[:-1]
                propname = propname.lstrip(" ")
                proptext = self._read_command(
                    "svnlook pg -r%d %s %s %s" %
                    (srcrev, quote(repos), quote(propname),
                     quote(thissrcpath)))
                newlump.setprop(propname, proptext)
        finally:
            props.close()

    def _read_command(self, command):
        """Run the shell `command' and return everything it prints."""
        pipe = os.popen(command, "r")
        try:
            return pipe.read()
        finally:
            pipe.close()
|
|
|
|
fr = sys.stdin
fw = sys.stdout

# Parse our command-line arguments.
parser = OptionParser(usage="Usage: %prog [options] src-repo regexp...")
parser.add_option("--drop-empty-revs", action="store_true",
                  dest="drop_empty_revs", default=False,
                  help="filter empty revisions from the dump")
parser.add_option("--renumber-revs", action="store_true",
                  dest="renumber_revs", default=False,
                  help="renumber remaining revisions")
(options, args) = parser.parse_args()
if len(args) < 2:
    sys.stderr.write(sys.argv[0] + ": Too few arguments.\n")
    # print_usage() expands %prog; printing parser.usage would not.
    parser.print_usage(sys.stderr)
    sys.exit(2)

# `repos' is read as a global by Filter when it has to run svnlook.
repos = args[0]
paths = InterestingPaths(args[1:])

# We use this table to map input revisions to output revisions.
if options.renumber_revs:
    revmap = {}
else:
    revmap = None

# Pass the dump-file header through unchanged.  Stop at end of file too,
# so that input without any revision record does not crash on None.
lump = read_lump(fr)
while lump is not None and "Revision-number" not in lump.hdrdict:
    write_lump(fw, lump, revmap)
    lump = read_lump(fr)

revhdr = lump

filt = Filter(paths)

current_output_rev = 0
while revhdr is not None:
    # Read revision header.
    assert "Revision-number" in revhdr.hdrdict
    contents = []
    # Read revision contents: every record up to the next revision
    # header (or end of file) belongs to this revision.
    while 1:
        lump = read_lump(fr)
        if lump is None or "Revision-number" in lump.hdrdict:
            newrevhdr = lump
            break
        contents.append(lump)

    # Alter the contents of the revision.
    filt.tweak(revhdr, contents)

    # Determine whether we should output this revision.  We only
    # update the current_output_rev if we're actually going to write
    # something.
    should_write = (len(contents) > 0 or not options.drop_empty_revs)
    if should_write:
        current_output_rev += 1

    # Update our revmap with information about this revision.  Note
    # that if this revision won't be written, current_output_rev still
    # points to the last version we dumped.
    input_rev = int(revhdr.hdrdict["Revision-number"])
    if revmap is not None:
        revmap[input_rev] = current_output_rev

    # Write out this revision, if that's what we've decided to do.
    if should_write:
        write_lump(fw, revhdr, revmap)
        for lump in contents:
            write_lump(fw, lump, revmap)

    # And loop round again.
    revhdr = newrevhdr

fr.close()
fw.close()
|