bin-utils/eml2mbox.rb

266 lines
11 KiB
Ruby
Raw Normal View History

2015-05-30 16:42:42 +00:00
#!/usr/bin/ruby
#============================================================================================#
# eml2mbox.rb v0.08 #
# Last updated: Jan 23, 2004 #
# #
# Converts a bunch of eml files into one mbox file. #
# #
# Usage: [ruby] eml2mbox.rb [-c] [-l] [-s] [-yz] [emlpath [trgtmbx]] #
# Switches: #
# -c Remove CRs (^M) appearing at end of lines (Unix) #
# -l Remove LFs appearing at beginning of lines (old Mac) - not tested #
# -s Don't use standard mbox postmark formatting (for From_ line) #
# This will force the use of original From and Date found in mail headers. #
# Not recommended, unless you really have problems importing emls. #
# -yz Use this to force the order of the year and timezone in date in the From_ #
# line from the default [year][timezone] to [timezone][year]. #
# emlpath - Path of dir with eml files. Defaults to the current dir if not specified #
# trgtmbx - Name of the target mbox file. Defaults to "archive.mbox" in 'emlpath' #
# #
# Ruby homepage: http://www.ruby-lang.org/en/ #
# Unix mailbox format: http://www.broobles.com/eml2mbox/mbox.html #
# This script : http://www.broobles.com/eml2mbox #
# #
#============================================================================================#
# Licence: #
# #
# This script is free software; you can redistribute it and/or modify it under the terms of #
# the GNU Lesser General Public License as published by the Free Software Foundation; #
# either version 2.1 of the License, or (at your option) any later version. #
# #
# You should have received a copy of the GNU Lesser General Public License along with this #
# script; if not, please visit http://www.gnu.org/copyleft/gpl.html for more information. #
#============================================================================================#
require "date"
begin
  # parsedate ships only with Ruby 1.8; it was removed from the standard library afterwards.
  require "parsedate"
  include ParseDate
rescue LoadError
  # parsedate is not available on this Ruby; continue without it.
end
#=======================================================#
# Class that encapsulates the processing file in memory #
#=======================================================#
# Collects the lines of a single .eml file and derives the mbox "From_" postmark line
# (sender + date) from the first From: and Date: headers it sees.
#
# Behaviour is driven by the global $switches hash ("noStandardFromLine", "removeCRs",
# "removeLFs"); the top-level helpers standardizeFrom and formMboxDate are used to
# normalise the sender and the date.
class FileInMemory
  # Named time zones mapped to their numeric offsets.
  ZoneOffset = {
    # Standard zones by RFC 2822
    'UTC' => '0000',
    'UT' => '0000', 'GMT' => '0000',
    'EST' => '-0500', 'EDT' => '-0400',
    'CST' => '-0600', 'CDT' => '-0500',
    'MST' => '-0700', 'MDT' => '-0600',
    'PST' => '-0800', 'PDT' => '-0700',
  }

  def initialize()
    @lines = []
    @counter = 1 # keep the 0 position for the From_ line
    @from = nil  # from part of the From_ line
    @date = nil  # date part of the From_ line
  end

  # Feeds one raw line (including its line break) of the eml file.
  # Picks up the first From:/Date: headers, escapes later lines starting with "From",
  # applies the requested line-ending fixes and stores the line.
  def addLine(line)
    # Once the real sender is known, any further line starting with "From" is a
    # 'false' From line and gets a '>' prepended.
    line = line.sub(/From/, '>From') if @from && line =~ /^From/

    # The first From: header that carries an address becomes the sender
    # (stored without its line break).
    if @from.nil? && line =~ /^From:\s.*@/
      @from = line.sub(/From:/, 'From').chop
      @from = standardizeFrom(@from) unless $switches["noStandardFromLine"]
    end

    # The first usable Date: header becomes the date.
    if @date.nil? && line =~ /^Date:\s/
      rawDate = line.sub(/Date:\s/, '')
      if $switches["noStandardFromLine"]
        # Keep the header content verbatim.
        @date = rawDate
      else
        # Stays nil when the header cannot be parsed, so getProcessedLines
        # can fall back to the current time instead of the run aborting.
        @date = parseMboxDate(rawDate)
      end
    end

    # Now add the line to the array
    @lines[@counter] = fixLineEndings(line)
    @counter += 1
  end

  # Builds the From_ line in position 0, appends an empty trailing line and returns
  # all lines. Returns nil when no From: header was found.
  def getProcessedLines()
    return nil if @from.nil?

    if @date.nil?
      puts "WARN: Failed to extract date. Will use current time in the From_ line"
      @date = formMboxDate(Time.now, nil)
    end
    @lines[0] = fixLineEndings(@from + " " + @date)
    @lines[@counter] = ""
    @lines
  end

  # Applies the CR/LF fixes selected through $switches and returns the result.
  def fixLineEndings(line)
    line = removeCR(line) if $switches["removeCRs"]
    line = removeLF(line) if $switches["removeLFs"]
    line
  end

  # emls usually have CR+LF (DOS) line endings, Unix uses LF as a line break,
  # so there's a hanging CR at the end of the line when viewed on Unix.
  # Removes the next-to-last character when it is a CR.
  def removeCR(line)
    # line[-2, 1] yields a one-character string on every Ruby version, whereas
    # line[-2] is an Integer on 1.8 and a String on 1.9+ (so "== 0xD" never matched there).
    return line unless line[-2, 1] == "\r"
    line[0...-2] + line[-1, 1]
  end

  # Similar to the above. This one is for Macs that use CR as a line break.
  # Removes the last character when it is a LF.
  def removeLF(line)
    line[-1, 1] == "\n" ? line[0...-1] : line
  end

  private

  # Turns the content of a Date: header into the mbox postmark date.
  # Returns nil when no year can be extracted or the values do not form a valid time.
  def parseMboxDate(rawDate)
    year, month, day, hour, minute, second, timezone =
      Date._parse(rawDate).values_at(:year, :mon, :mday, :hour, :min, :sec, :zone)
    return nil if year.nil?

    # Need to convert a named timezone to a 4 digit offset; unknown names yield nil
    # and the zone is then left out by formMboxDate.
    timezone = ZoneOffset[timezone] unless timezone =~ /[+-]\d*/
    formMboxDate(Time.gm(year, month, day, hour, minute, second), timezone)
  rescue ArgumentError, TypeError
    nil
  end
end
#================#
# Helper methods #
#================#
# Reduces a From line to "From " followed by the address found between the last
# '<' and the last '>' of the line, e.g.
#   'From "some one <aa@aa.aa>" <aa@aa.aa>' -> 'From aa@aa.aa'
# A line lacking either bracket is returned untouched (already well formed, or invalid).
def standardizeFrom(fromLine)
  lastOpen = fromLine.rindex('<')
  lastClose = fromLine.rindex('>')
  return fromLine if lastOpen.nil? || lastClose.nil?

  # Keep the leading five characters ("From ") and append what sits between the brackets.
  address = fromLine[(lastOpen + 1)..(lastClose - 1)]
  fromLine[0, 5] + address
end
# Formats +time+ as an mbox postmark date.
# With a nil timezone only the timestamp and year are emitted. Otherwise the timezone
# is placed after the year, or before it when the "zoneYearOrder" switch is set.
# mbox date format used is described here:
# http://www.broobles.com/eml2mbox/mbox.html
def formMboxDate(time,timezone)
  stamp = "%a %b %d %H:%M:%S"
  layout =
    if timezone.nil?
      "#{stamp} %Y"
    elsif $switches["zoneYearOrder"]
      "#{stamp} #{timezone} %Y"
    else
      "#{stamp} %Y #{timezone}"
    end
  time.strftime(layout)
end
# Consumes the leading switch arguments (those starting with '-') from ARGV.
# Returns a hash keyed by switch name whose values default to false; each recognised
# switch is set to true and announced on stdout, unknown ones are reported and ignored.
# Afterwards ARGV[0] and ARGV[1] hold the first two non-switch arguments (or nil).
def extractSwitches()
  # command-line flag => [switch name, announcement]
  recognised = {
    "-c"  => ["removeCRs", "\nWill fix lines ending with a CR"],
    "-l"  => ["removeLFs", "\nWill fix lines beggining with a LF"],
    "-s"  => ["noStandardFromLine", "\nWill use From and Date from mail headers in From_ line"],
    "-yz" => ["zoneYearOrder", "\nTimezone will be placed before the year in From_ line"]
  }

  result = Hash.new(false)
  pos = 0
  while ARGV[pos] =~ /^-/
    flag = ARGV[pos]
    name, announcement = recognised[flag]
    if name
      result[name] = true
      puts announcement
    else
      puts "\nUnknown switch: "+flag+". Ignoring."
    end
    pos += 1
  end

  # Shift the real arguments to the front of ARGV
  ARGV[0] = ARGV[pos]
  ARGV[1] = ARGV[pos + 1]
  result
end
#===============#
# Main #
#===============#
# Script body: reads the switches and positional arguments, asks what to do with an
# existing target file, then converts every *.eml file of the chosen directory into
# one mbox file.
$switches = extractSwitches()

# Extract specified directory with emls and the target archive (if any)
emlDir = ARGV[0] || "."                          # default if not specified
mboxArchive = ARGV[1] || emlDir+"/archive.mbox"  # default if not specified (used in messages)

# Show specified settings
puts "\nSpecified dir : "+emlDir
puts "Specified file: "+mboxArchive+"\n"

# Check that the dir exists
unless FileTest.directory?(emlDir)
  puts "\n["+emlDir+"] is not a directory (might not exist). Please specify a valid dir"
  exit(0)
end
Dir.chdir(emlDir)

# The working directory is now emlDir, so the default archive is plain "archive.mbox".
# Re-using emlDir+"/archive.mbox" here would resolve a relative emlDir a second time.
# An explicitly given target is still resolved relative to emlDir, as before.
mboxPath = ARGV[1] || "archive.mbox"

# Check if destination file exists. If yes allow user to select an option.
# Only the open mode is decided here; the file is not touched until there is
# something to write, so an existing archive is never truncated or deleted in vain.
openMode = "w"
if FileTest.exist?(mboxPath)
  print "\nFile ["+mboxArchive+"] exists! Please select: [A]ppend [O]verwrite [C]ancel (default) "
  sel = (STDIN.gets || "").chomp # EOF on stdin counts as Cancel
  case sel
  when 'A', 'a' then openMode = "a"
  when 'O', 'o' then openMode = "w"
  else openMode = nil
  end
end

if openMode
  puts
  files = Dir["*.eml"]
  if files.empty?
    puts "No *.eml files in this directory. mbox file not created."
    exit(0)
  end

  # Read the mails as raw bytes where the Ruby version supports encodings: mail
  # content comes in arbitrary charsets and regexp matching on a string that is
  # invalid in the default encoding can raise.
  emlMode = "".respond_to?(:encoding) ? "r:ASCII-8BIT" : "r"

  File.open(mboxPath, openMode) do |aFile|
    # For each .eml file in the specified directory do the following
    files.each do |x|
      puts "Processing file: "+x
      thisFile = FileInMemory.new()
      # Block form closes the input file as soon as it has been read.
      File.open(x, emlMode) { |eml| eml.each_line { |item| thisFile.addLine(item) } }
      lines = thisFile.getProcessedLines
      if lines.nil?
        puts "WARN: File ["+x+"] doesn't seem to have a regular From: line. Not included in mbox"
      else
        # NOTE(review): puts appends "\n" to any line not already ending in one, so a LF
        # stripped by the -l switch is written back here - confirm the intended -l output.
        lines.each { |line| aFile.puts line }
      end
    end
  end
end