BackupScript

For backups I have a dedicated hard drive on my server, and a number of jobs that pull data from my various machines using rsync (via ssh) and store it on that drive. Based on information from http://www.mikerubel.org/computers/rsync_snapshots/, I use hard links to minimise the disk space used, allowing me to keep several months' worth of backups.

Download the latest gkbackup

The code is available in a git repo at http://git.trollgod.org.uk/?p=gkbackup.git;a=summary.

#!/usr/bin/env python
"""Backup program using rsync and hardlinks.

Copyright (c) 2005-2009 Ghworg

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""
__version__ = 1
__revision__ = 3
buildno = 152
verString = "gkbackup %d.%d.%d" % (__version__, __revision__, buildno)

# Delay importing os and shutil till after options parsing
import calendar, datetime, logging, optparse, re, sys, time

try:
    from iniparse.compat import ConfigParser
except ImportError:
    from ConfigParser import ConfigParser

try:
    import posix1e
    aclSupport = True
except ImportError:
    aclSupport = False
try:
    import xattr
    xattrSupport = True
except ImportError:
    xattrSupport = False

   
class CustomFormatter(logging.Formatter):
    """Custom log message formatter that adds the job name to every record.

    The format string may reference %(jobname)s; this formatter fills it
    in from the global options, falling back to an empty string when the
    options have not been parsed yet or carry no jobname.
    """
    def __init__(self, format, timeFormat):
        logging.Formatter.__init__(self, format, timeFormat)

    def format(self, record):
        """Attach a jobname attribute to record, then format as usual."""
        # Default to '' so %(jobname)s can never raise a KeyError, even
        # when options exists but has no jobname attribute.
        record.jobname = ''
        try:
            if hasattr(options, 'jobname') and options.jobname:
                record.jobname = options.jobname
        except NameError:
            # options not defined yet (logging before option parsing)
            pass
        return logging.Formatter.format(self, record)

   
# Per-level cache of partial (not yet newline-terminated) output lines.
_logProgressCache = {}

def logProgress(output, level=logging.INFO):
    """Print data captured from an external command to the logs.

    Output may arrive in arbitrary chunks; complete lines (terminated by
    a newline) are logged immediately, while any trailing partial line
    is cached per log level until the rest of it arrives.
    """
    cache = _logProgressCache.get(level, '')
    for line in output.splitlines(True):
        cache += line.strip('\r\n')
        # A newline marks the end of a log line; flush the cache.
        if '\n' in line and len(cache) > 0:
            logging.getLogger().log(level, cache)
            cache = ''
    _logProgressCache[level] = cache

def pingrsync(name, source):
    """Check that the client machine is reachable by listing the rsync source.

    Returns True when the source answers the rsync listing and has
    something to back up, False when it is unreachable, empty, or is a
    plain local path (no ':' / '::' in it).
    """
    logging.debug('Looking for %s' % name)
    try:
        # '::' denotes an rsync daemon source, ':' rsync over ssh.
        # Check '::' first so a daemon source is only probed once.
        for connectType in ['::', ':']:
            if connectType in source:
                lines = getCommandOutput('rsync %s' % (source))
                number = len(lines.splitlines())
                logging.info('%s found (responded to rsync request)' % name)
                if number > 1:
                    return True
                else:
                    logging.info('Nothing to backup (empty source dir)')
                # Don't probe the same source a second time via ':'.
                break
    except RuntimeError as error:
        logProgress(error.args[0], logging.DEBUG)
    return False

def mount(mountpoint):
    """Mount the drive at mountpoint.  Drive must be in fstab.

    Returns True on success.  Only attempts the mount when running as
    root or when a custom mount command has been configured.
    """
    if options.custommount:
        mountCmd = options.custommount
    else:
        mountCmd = 'mount'

    if os.getuid() == 0 or options.custommount:
        logging.debug('Mounting at %s' % mountpoint)
        try:
            getCommandOutput('%s %s' % (mountCmd, mountpoint))
            logging.info('Mounted drive at %s' % mountpoint)
            return True
        except RuntimeError as error:
            logProgress(error.args[0], logging.DEBUG)
    return False

def umount(mountpoint):
    """Unmount the drive at mountpoint.  Drive must be in fstab.

    Returns True on success.  Only attempts the unmount when running as
    root or when a custom umount command has been configured.
    """
    if options.customumount:
        umountCmd = options.customumount
    else:
        umountCmd = 'umount'

    if os.getuid() == 0 or options.customumount:
        logging.debug('Unmounting %s' % mountpoint)
        try:
            getCommandOutput('%s %s' % (umountCmd, mountpoint))
            logging.info('Unmounted drive at %s' % mountpoint)
            return True
        except RuntimeError as error:
            logProgress(error.args[0], logging.DEBUG)
    return False

def sshagent(source):
    """Add the backup ssh key to ssh-agent if needed and requested.

    Only runs when the source is an rsync-over-ssh path (single colon,
    not an rsync daemon '::' path) and an ssh key has been configured.
    """
    if (':' in source) and ('::' not in source) and (options.sshkey):
        logging.debug('Adding backup ssh key to ssh-agent')
        sshadd = getCommandOutput('ssh-add %s' % options.sshkey)
        logProgress(sshadd, logging.DEBUG)

def getDateDiffs(oldDate, newDate):
    """Return the difference between two dates as (years, months, days).

    Works like manual date arithmetic: a negative day count borrows from
    the month (using the length of oldDate's month), and a negative
    month count borrows from the year.
    """
    monthlengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    if calendar.isleap(oldDate.year):
        monthlengths[1] += 1
    yeardiff = newDate.year - oldDate.year
    monthdiff = newDate.month - oldDate.month
    daydiff = newDate.day - oldDate.day
    if daydiff < 0:
        monthdiff -= 1
        daydiff += monthlengths[oldDate.month - 1]
    if monthdiff < 0:
        yeardiff -= 1
        monthdiff += 12
    return yeardiff, monthdiff, daydiff

def genNewname(adir):
    """Generate a new name based on the age of the dir.

    adir is a (timestamp, name) tuple as produced by addTimestamp.

    Bias the age by an hour to make sure yesterday's backup gets
    moved along even if the job runs a few minutes before the
    previous day's.
    """
    oldDate = datetime.datetime.fromtimestamp(adir[0])
    newDate = datetime.datetime.fromtimestamp(now + 3600)
    yeardiff, monthdiff, daydiff = getDateDiffs(oldDate, newDate)
    if yeardiff > 0:
        return 'yearly.%d' % yeardiff
    elif monthdiff > 0:
        return 'monthly.%d' % monthdiff
    elif daydiff >= 7:
        return 'weekly.%d' % (daydiff // 7)
    else:
        return 'daily.%d' % daydiff

def filterdir(dirname):
    """Return True when dirname looks like a backup dir (e.g. 'daily.3')."""
    return bool(re.match(r'(daily|weekly|monthly|yearly)\.[0-9]+$', dirname))

def addTimestamp(entry):
    """Return a (mtime, entry) tuple for the backup dir named entry."""
    fullpath = os.path.join(options.destdir, entry)
    return os.path.getmtime(fullpath), entry

def listOldBackups():
    """Return (mtime, name) tuples for previous backups, oldest first.

    sorted() on the tuples orders by timestamp; the comprehension works
    identically whether filter/map would return lists or iterators.
    """
    return sorted(addTimestamp(entry)
                  for entry in os.listdir(options.destdir)
                  if filterdir(entry))

def renameByAge():
    """Rename backup dirs to $prefix.$age, deleting those older than maxage.

    Processes oldest first so a name freed by a rename or delete is
    available before a younger dir wants it.
    """
    logging.debug('Renaming old backups')

    for adir in listOldBackups():
        oldname = adir[1]
        oldpath = os.path.join(options.destdir, oldname)
        ageInDays = (now - adir[0]) / 86400

        if ageInDays > options.maxage:
            logging.info('Deleting %s (%d days old)' % (oldname, ageInDays))
            shutil.rmtree(oldpath)
            continue
        newname = genNewname(adir)
        if oldname == newname:
            logging.info('Skipping %s (%d days old)' % (oldname, ageInDays))
            continue
        logging.info('Moving %s to %s' % (oldname, newname))
        newpath = os.path.join(options.destdir, newname)
        if os.path.exists(newpath):
            # Collision: an older backup already holds the target name.
            logging.info('%s already exists, deleting %s' % (newname, oldname))
            shutil.rmtree(oldpath)
        else:
            os.rename(oldpath, newpath)

def findnewest():
    """Return the name of the most recent backup, or None when there are none."""
    backups = listOldBackups()
    if backups:
        return backups[-1][1]
    return None

def getSrcAndDstPaths(lsrc, ldst, root, name):
    """Generate full src and dst paths from os.walk components."""
    sourcepath = os.path.join(root, name)
    # Strip the source prefix (and its trailing separator) to get the
    # path relative to lsrc, then rebase that under ldst.
    relative = sourcepath[len(lsrc) + 1:]
    return sourcepath, os.path.join(ldst, relative)

def clonedir(srcdir, dstdir):
    """Create dstdir copying srcdir's owner, perms/ACLs, xattrs and times."""
    srcStat = os.stat(srcdir)
    os.mkdir(dstdir)
    os.chown(dstdir, srcStat.st_uid, srcStat.st_gid)
    if aclSupport:
        # With ACL support the full ACL is applied instead of plain mode bits.
        posix1e.ACL(file=srcdir).applyto(dstdir)
    else:
        os.chmod(dstdir, srcStat.st_mode)
    if xattrSupport:
        for attrName in xattr.listxattr(srcdir):
            xattr.setxattr(dstdir, attrName, xattr.getxattr(srcdir, attrName))
    # Timestamps go last so the metadata writes above don't disturb them.
    os.utime(dstdir, (srcStat.st_atime, srcStat.st_mtime))

def link(parentDir):
    """Create a hardlink copy of the newest backup as options.latestname.

    Directories cannot be hardlinked, so they are recreated via clonedir
    and symlinks are recreated as symlinks; regular files are hardlinked.
    Does nothing when the link destination already exists or there is no
    previous backup.
    """
    newestDir = findnewest()
    linkdst = os.path.join(parentDir, options.latestname)
    if os.path.exists(linkdst) or newestDir is None:
        logging.warning('Nothing to link')
        return
    lsrc = os.path.join(parentDir, newestDir)
    logging.info('Linking from %s to %s' % (newestDir, options.latestname))
    os.mkdir(linkdst)
    for root, dirs, files in os.walk(lsrc):
        for name in dirs:
            realdir, linkpath = getSrcAndDstPaths(lsrc, linkdst, root, name)
            if os.path.islink(realdir):
                os.symlink(os.readlink(realdir), linkpath)
            else:
                clonedir(realdir, linkpath)
        for name in files:
            try:
                realfile, linkpath = getSrcAndDstPaths(lsrc, linkdst, root, name)
                os.link(realfile, linkpath)
            except OSError as errdata:
                # Log and carry on; one unlinkable file shouldn't abort the backup.
                logging.error('Error linking %s to %s: %s' % (lsrc, linkdst, errdata))

def sync():
    """Run rsync to pull new/changed data into the latest backup dir."""
    logging.info('Making new backup')
    dst = os.path.join(options.destdir, options.latestname)
    command = 'rsync %s %s %s' % (options.rsyncopts, src, dst)
    logging.info('rsync %s %s' % (src, dst))
    logging.debug(command)
    getCommandOutput(command, logProgress, timeout=900000)
    # Stamp the backup dir with the run time so renameByAge ages it correctly.
    os.utime(os.path.join(options.destdir, options.latestname), (now, now))

def storeBackupTime():
    """Store the timestamp for the current backup in the timestamps file.

    Prefers the dedicated timestamps file, falling back to the config
    file; does nothing when neither was configured.
    """
    if options.timestamps:
        tstampFile = options.timestamps
    elif options.config:
        tstampFile = options.config
    else:
        logging.debug('No timestamp file')
        return

    config = ConfigParser()
    if os.path.exists(tstampFile):
        config.read(tstampFile)
    if not config.has_section(options.jobname):
        config.add_section(options.jobname)
    logging.debug('Save timestamp "%s" to %s' % (now, tstampFile))
    # Store as a string: writes the same text, and newer ConfigParser
    # implementations reject non-string values.
    config.set(options.jobname, 'lastbackup', str(now))
    # Close the file explicitly rather than leaking the handle.
    outfile = open(tstampFile, 'w')
    try:
        config.write(outfile)
    finally:
        outfile.close()
   
   
def parseOptions():
    """Parse the command line, returning the optparse Values object."""
    opts = optparse.OptionParser(version=verString)
    reqgrp = optparse.OptionGroup(opts, 'Required')
    optgrp = optparse.OptionGroup(opts, 'Optional')

    # Required options
    reqgrp.add_option('-j', '--jobname', metavar='NAME', help='Name of backup job')
    reqgrp.add_option('-s', '--src', dest='srcdir', metavar='DIR', help='Directory to backup')
    reqgrp.add_option('-c', '--maxage', type='int', metavar='N', help='Number of days to keep old backups for')
    reqgrp.add_option('-d', '--dest', dest='destdir', metavar='DIR', help='Dir to place backups in '
                      '(will create a dir named after the job name)')

    # Optional options
    optgrp.add_option('-m', '--machine', metavar='HOST', help='Hostname for rsync to pull data from, '
                      'if omitted assumes localhost')
    optgrp.add_option('-e', '--excluded', metavar='FILE', help='File containing list of files to exclude from the backup')
    optgrp.add_option('-l', '--logdir', metavar='DIR', help='DIR to store logfiles in')
    optgrp.add_option('-n', '--no-act', action='store_true', dest='noact', help='Show what would be done in the backup job')
    optgrp.add_option('--rsyncopts', metavar='OPTS', help='Set rsync options, defaults to '
                      '"-aAxzSO --delete-excluded --delete-during --fake-super --numeric-ids"')
    optgrp.add_option('--sshkey', metavar='FILE', help='Run ssh-agent and give it the key FILE')
    optgrp.add_option('--logfile', metavar='FILE', help='Log to FILE')
    optgrp.add_option('--latestname', default='daily.0/')
    optgrp.add_option('--period', type='int', help='Min time between backups')
    optgrp.add_option('--config', metavar='FILE', help='FILE to read more options from')
    optgrp.add_option('--timestamps', metavar='FILE', help='FILE to read/write backup timestamps from/to')
    optgrp.add_option('--mount', metavar='MOUNTPOINT', help='Mount drive at MOUNTPOINT on demand')
    optgrp.add_option('--custommount', metavar='COMMAND', help='Use a custom mount command')
    optgrp.add_option('--customumount', metavar='COMMAND', help='Use a custom umount command')
    optgrp.add_option('-q', '--quiet', action='store_const', const=0, dest='verbose')
    optgrp.add_option('-v', '--verbose', action='store_const', const=2, dest='verbose')
    optgrp.add_option('--debug', action='store_const', const=3, dest='verbose')

    opts.add_option_group(reqgrp)
    opts.add_option_group(optgrp)
    opts.set_defaults(verbose=1)

    (parsedopts, dummy) = opts.parse_args()
    return parsedopts

def readSection(config, section, fileopts):
    """Load options from the named ini-file section onto fileopts.

    Command line options take priority: any attribute already set to a
    non-None value on fileopts is left alone.  All-digit values are
    converted to int, everything else stays a string.
    """
    if config.has_section(section):
        for opt in config.items(section):
            if hasattr(fileopts, opt[0]) and getattr(fileopts, opt[0]) is not None:
                # Already set on the command line; don't override it.
                continue
            tempStr = opt[1]
            if tempStr.isdigit():
                tempVal = int(tempStr)
            else:
                tempVal = tempStr
            setattr(fileopts, opt[0], tempVal)
    return fileopts

def parseOptionsFile(cmdlineOpts):
    """Read the ini file, adding options to any cmd line args already given.

    The [common] section applies to every job; a section named after the
    job then adds job-specific settings.  The jobname is made available
    to the file as an interpolation default.
    """
    jndict = {'jobname': cmdlineOpts.jobname}
    config = ConfigParser(jndict)
    config.read(cmdlineOpts.config)
    fileopts = readSection(config, 'common', cmdlineOpts)
    if cmdlineOpts.jobname:
        fileopts = readSection(config, cmdlineOpts.jobname, fileopts)

    return fileopts

def init():
    """Validate required options and derive the rsync source path.

    Exits with status 2 when a required option is missing.  Returns the
    rsync source (host:dir for remote jobs, plain dir for local ones).
    """
    setupLogging2()

    if not options.destdir:
        logging.error('Error: No destination dir specified')
        sys.exit(2)

    if not options.srcdir:
        logging.error('Error: No source dir specified')
        sys.exit(2)

    if options.jobname is None:
        logging.error('Error: No name for backup job given')
        sys.exit(2)

    if options.machine:
        rsyncsrc = options.machine + ':' + options.srcdir
    else:
        rsyncsrc = options.srcdir
    # Each job gets its own subdir of the destination drive.
    options.destdir = os.path.join(options.destdir, options.jobname)
    if options.verbose == 0:
        options.rsyncopts += ' -q'
    else:
        options.rsyncopts += ' -v'
    if options.excluded:
        options.rsyncopts += ' --exclude-from=' + options.excluded
    return rsyncsrc

def setupLogging1():
    """Set up the logging module.

    Logs to the terminal if one is attached (usually only for
    debugging).  Returns the root logger.
    """
    log = logging.getLogger()
    logformat = '%(asctime)s %(levelname)s %(message)s'
    dateformat = '%Y-%m-%d %H:%M:%S'
    frmttr = logging.Formatter(logformat, dateformat)

    if sys.stdin.isatty():
        shdlr = logging.StreamHandler(sys.stdout)
        shdlr.setFormatter(frmttr)
        shdlr.setLevel(logging.DEBUG)
        log.addHandler(shdlr)
    log.setLevel(logging.INFO)
    return log

def setupLogging2():
    """Add any user requested log destinations to the log already setup.

    Sets the root log level from options.verbose and, when a logfile or
    a logdir+jobname is configured, adds a file handler that tags each
    record with the job name.  Returns the root logger.
    """
    log = logging.getLogger()

    if options.verbose >= 3:
        log.setLevel(logging.DEBUG)
    elif options.verbose == 0:
        log.setLevel(logging.WARNING)
    else:
        log.setLevel(logging.INFO)

    logformat = '%(asctime)s %(levelname)-7s: %(jobname)s:  %(message)s'
    dateformat = '%Y-%m-%d %H:%M:%S'
    frmttr = CustomFormatter(logformat, dateformat)

    # Work out where the logfile should go, if anywhere; computing the
    # path first guarantees the handler variable can never be unbound.
    logpath = None
    if options.logfile:
        if os.path.isabs(options.logfile):
            logpath = options.logfile
        elif options.logdir:
            logpath = os.path.join(options.logdir, options.logfile)
        else:
            logpath = os.path.join(os.getcwd(), options.logfile)
    elif options.logdir and options.jobname:
        logpath = os.path.join(options.logdir, options.jobname + '.log')

    if logpath:
        fhdlr = logging.FileHandler(logpath)
        fhdlr.setFormatter(frmttr)
        fhdlr.setLevel(logging.INFO)
        log.addHandler(fhdlr)

    return log

def checkLastTime():
    """Return True when enough time has passed to need another backup.

    Reads the lastbackup timestamp from the timestamps file (falling
    back to the config file) and compares its age against
    options.period.  Returns True when no timestamp can be found or no
    minimum period is configured.
    """
    config = ConfigParser()
    if options.timestamps and os.path.exists(options.timestamps):
        config.read(options.timestamps)
    elif options.config:
        config.read(options.config)
    else:
        # No timestamp source configured; always back up.
        return True

    if not config.has_section(options.jobname):
        return True
    if config.has_option(options.jobname, 'lastbackup'):
        logging.debug('Reading timestamp from %s' % options.timestamps)
        tstamp = config.getfloat(options.jobname, 'lastbackup')
        oDate = datetime.datetime.fromtimestamp(tstamp)
        nDate = datetime.datetime.fromtimestamp(now)
        logging.debug('Last backup at %s' % oDate)
        logging.debug('Time is now    %s' % nDate)
        timediff = nDate - oDate
        logging.debug('Age of last backup = %s' % timediff)
        if options.period and timediff.days < options.period:
            logging.debug('Not long enough since last backup (%d < %d)' %
                          (timediff.days, options.period))
            return False
        return True
    logging.debug('lastbackup timestamp not found')
    return True

def main():
    """Actually do stuff: check hosts, mount, rotate, link, sync, unmount."""
    # Initialised before the try block so the unmount check below can't
    # hit a NameError when an early exception is raised.
    srcUp = False
    dstUp = False
    try:
        sshagent(src)
        if options.machine:
            srcUp = pingrsync(options.machine, src)
        else:
            srcUp = True  # Local machine, must be up

        dstUp = False
        if srcUp:
            if options.mount:
                dstUp = mount(options.mount)
            else:
                dstUp = True

        if srcUp and dstUp:
            if not os.path.exists(options.destdir):
                logging.warning('Backupdir "%s" doesn\'t exist' % options.destdir)
                logging.info('Creating dir %s' % options.destdir)
                os.mkdir(options.destdir)
            if not os.path.isdir(options.destdir):
                raise RuntimeError('Destdir "%s" isn\'t a dir' % options.destdir)
            renameByAge()
            link(options.destdir)
            sync()
            storeBackupTime()
            exitCode = 0
        else:
            exitCode = 1
    except RuntimeError as err:
        logProgress(err.args[0], logging.ERROR)
        exitCode = 2
    except KeyboardInterrupt:
        logging.error('User interrupted program')
        exitCode = 3

    # If we mounted the drive then unmount it
    if srcUp and dstUp and options.mount:
        umount(options.mount)

    sys.exit(exitCode)

###############################
# Main
###############################
if __name__ == '__main__':
    # Log to the terminal immediately; file logging is added later once
    # the options (and hence the jobname/logdir) are known.
    setupLogging1()

    options = parseOptions()
    if options.config:
        options = parseOptionsFile(options)
    # Set unset options to default values
    if options.rsyncopts is None:
        options.rsyncopts = '-aAxzSO --delete-excluded --delete-during --fake-super --numeric-ids'
    if options.maxage is None:
        options.maxage = sys.maxint
    if options.noact:
        # Dry-run mode: swap in stub modules that report instead of acting.
        import fakeos as os
        import fakeshutil as shutil
        from fakegetCommandOutput import getCommandOutput
    else:
        import os
        import shutil
        from getCommandOutput import getCommandOutput

    src = init()
    now = time.time()
    if checkLastTime():
        main()