Skip to content

Commit

Permalink
add support for persistent checkpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
abbbi committed May 10, 2021
2 parents 6d3f919 24efc95 commit ebd201d
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 7 deletions.
60 changes: 60 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 239,66 @@ the NBD server. The behavior can be changed by using option `-q` to use common
qemu tools (qemu-img map ..). By default `virtnbdbackup` uses a custom
implemented extent handler.

# Transient virtual machines: checkpoint persistency

In case virtual machines are started in transient environments, such as
cluster solutions like `pacemaker`, situations can appear where the checkpoints
for the virtual machine defined by libvirt are not in sync with the bitmap
information in the qcow files.

In case libvirt creates a checkpoint, the checkpoint information is stored
in two places:

* /var/lib/libvirt/qemu/checkpoint/<domain_name>
* In the bitmap file of the virtual machine's qcow image.

Depending on the cluster solution, in case virtual machines are destroyed
on host A and are re-defined on host B, libvirt loses the information about
those checkpoints. Unfortunately, `libvirtd` scans the checkpoints only once
during startup.

This can result in a situation where the bitmap is still defined in the
qcow image, but libvirt doesn't know about the checkpoint. The backup then
fails with:

`Unable to execute QEMU command 'transaction': Bitmap already exists`

By default `virtnbdbackup` attempts to store the checkpoint information in the
default backup directory. In situations where it detects that a checkpoint is
missing, it attempts to redefine it from the prior backups.

In order to store the checkpoint information at some central place, the option
`--checkpointdir` can be used. This allows having persistent checkpoints
across multiple nodes.

For example:

1) Create a backup of the virtual machine on host A and store the checkpoints
in a directory shared between the hosts, `/mnt/shared/vm5`:

`virtnbdbackup -d vm5 -l full -o /tmp/backup --checkpointdir /mnt/shared/vm5`

2) After the backup, the virtual machine is relocated to host B and has lost
its information about checkpoints and bitmaps; thus, the next full backup
fails with:

```
virtnbdbackup -d vm5 -l full -o /tmp/backup_hostb
[..]
unable to execute QEMU command 'transaction': Bitmap already exists: virtnbdbackup.0
```

3) One can now pass the checkpoint dir and files written from host A, and
virtnbdbackup will redefine missing checkpoints and execute a new full
backup:

```
virtnbdbackup -d vm5 -l full -o /tmp/backup_hostb --checkpointdir /mnt/shared/vm5
[..]
redefineCheckpoints: Redefine missing checkpoint virtnbdbackup.0
[..]
```

# FAQ
## The thin provisioned backups are bigger than the original qcow images

Expand Down
80 changes: 79 additions & 1 deletion libvirtnbdbackup/libvirthelper/libvirthelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 19,8 @@
import random
import logging
import libvirt
import glob
import os
from xml.etree import ElementTree

# this is required so libvirt.py does not report errors to stderr
Expand Down Expand Up @@ -238,10 240,21 @@ def checkpointExists(self, domObj, checkpointName):
"""
return domObj.checkpointLookupByName(checkpointName)

def removeAllCheckpoints(self, domObj, checkpointList):
def removeAllCheckpoints(self, domObj, checkpointList, args):
""" Remove all existing checkpoints for a virtual machine,
used during FULL backup to reset checkpoint chain
"""

# clean persistent storage in args.checkpointdir
logging.debug('Cleaning up persistent storage {:s}' . format(args.checkpointdir))
try:
for checkpointFile in glob.glob('{:s}/*.xml' . format(args.checkpointdir)):
logging.debug('Remove checkpoint file {:s}' . format(checkpointFile))
os.remove(checkpointFile)
except Exception as e:
logging.error('Unable to clean persistent storage {:s}: {}' . format(args.checkpointdir, e))
sys.exit(1)

if checkpointList is None:
cpts = domObj.listAllCheckpoints()
if cpts:
Expand All @@ -260,3 273,68 @@ def stopBackup(self, domObj):
""" Cancel the backup task using job abort
"""
return domObj.abortJob()

def redefineCheckpoints(self, domObj, args):
    """ Redefine checkpoints from persistent storage

    Reads every ``*.xml`` file found in ``args.checkpointdir``, takes the
    checkpoint name from its ``<name>`` element and looks it up on the
    domain. If the lookup fails with VIR_ERR_NO_DOMAIN_CHECKPOINT the
    checkpoint is redefined from the stored XML; any other error aborts.

    Files are processed in checkpoint order: a trailing numeric component
    of the file name (``virtnbdbackup.2.xml`` -> 2) is compared as a
    number, so ``virtnbdbackup.2`` is handled before ``virtnbdbackup.10``.
    A plain string sort would put ``.10`` first.
    NOTE(review): presumably libvirt needs a parent checkpoint to exist
    before a child can be redefined -- confirm.

    :param domObj: domain object providing checkpointLookupByName() and
        checkpointCreateXML()
    :param args: parsed options; only ``args.checkpointdir`` is read
    :return: True if every file was processed, False on the first error
        (the error is logged)
    """
    def checkpointOrder(path):
        # Drop the directory and the ".xml" suffix matched by the glob.
        name = os.path.basename(path)[:-len('.xml')]
        prefix, _, suffix = name.rpartition('.')
        if suffix.isdecimal():
            return (prefix, int(suffix), name)
        # No numeric suffix: sort before any numbered sibling.
        return (name, -1, name)

    # get list of all .xml files in checkpointdir
    logging.info('Loading checkpoint list from: {:s}'.format(args.checkpointdir))
    try:
        checkpointFiles = glob.glob('{:s}/*.xml'.format(args.checkpointdir))
    except Exception as e:
        logging.error('Unable to get checkpoint list from {:s}: {}'.format(args.checkpointdir, e))
        return False

    for checkpointFile in sorted(checkpointFiles, key=checkpointOrder):
        logging.debug('Loading checkpoint config from: {:s}'.format(checkpointFile))
        try:
            with open(checkpointFile, 'r') as f:
                checkpointConfig = f.read()
                root = ElementTree.fromstring(checkpointConfig)
        except Exception as e:
            logging.error('Unable to load checkpoint config from {:s}: {}'.format(checkpointFile, e))
            return False

        try:
            checkpointName = root.find('name').text
        except Exception as e:
            logging.error('Unable to find checkpoint name: {}'.format(e))
            return False

        try:
            # Only the success/failure of the lookup matters here.
            domObj.checkpointLookupByName(checkpointName)
            logging.debug('Checkpoint {:s} found'.format(checkpointName))
            continue
        except libvirt.libvirtError as e:
            # ignore VIR_ERR_NO_DOMAIN_CHECKPOINT, report other errors
            if e.get_error_code() != libvirt.VIR_ERR_NO_DOMAIN_CHECKPOINT:
                logging.error('libvirt error: {}'.format(e))
                return False

        logging.info('Redefine missing checkpoint {:s}'.format(checkpointName))
        try:
            domObj.checkpointCreateXML(checkpointConfig, libvirt.VIR_DOMAIN_CHECKPOINT_CREATE_REDEFINE)
        except Exception as e:
            logging.error('Unable to redefine checkpoint {:s}: {}'.format(checkpointName, e))
            return False

    return True

def backupCheckpoint(self, domObj, args, checkpointName):
    """save checkpoint config to persistent storage

    Looks up ``checkpointName`` on the domain and writes its XML
    description to ``<args.checkpointdir>/<checkpointName>.xml``.

    :param domObj: domain object providing checkpointLookupByName()
    :param args: parsed options; only ``args.checkpointdir`` is read
    :param checkpointName: name of the checkpoint to save
    :return: True on success, False if the lookup or the write failed
        (the error is logged)
    """
    checkpointFile = '{:s}/{:s}.xml'.format(
        args.checkpointdir,
        checkpointName
    )
    logging.info('Saving checkpoint config to {:s}'.format(checkpointFile))
    try:
        # Fetch the XML before opening the target: opening with 'w' creates
        # or truncates the file, so a failed lookup would otherwise leave an
        # empty .xml file behind in the checkpoint directory.
        checkpointConfig = domObj.checkpointLookupByName(checkpointName).getXMLDesc()
        with open(checkpointFile, 'w') as f:
            f.write(checkpointConfig)
        return True
    except Exception as e:
        logging.error('Unable to save checkpoint config to file {:s}: {}'.format(
            checkpointFile,
            e
        ))
        return False
2 changes: 1 addition & 1 deletion t/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 15,7 @@ all: | $(bats) vm1.tests vm2.tests vm3.tests vm4.tests
clean:
@rm -rf $(bats)
@rm -rf /tmp/testset*
for vm in vm1 vm2 vm3; do \
for vm in vm1 vm2 vm3 vm4 vm5; do \
virsh destroy $$vm ; \
virsh undefine $$vm --remove-all-storage --checkpoints-metadata ; \
done
Expand Down
Binary file modified t/vm5/vm5-sda.qcow2
Binary file not shown.
33 changes: 28 additions & 5 deletions virtnbdbackup
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 59,9 @@ def main():
parser.add_argument(
"-o", "--output", required=True, type=str,
help="Output target directory")
parser.add_argument(
'-C', '--checkpointdir', required=False, default=None, type=str,
help='Persistent libvirt checkpoint storage directory')
parser.add_argument(
"-S", "--scratchdir", default="/var/tmp", required=False, type=str,
help="Target directory for temporary scratch file")
Expand Down Expand Up @@ -145,6 148,17 @@ def main():
logging.error("Target directory must empty for full or copy backup.")
sys.exit(1)

if not args.checkpointdir:
args.checkpointdir = "{}/checkpoints".format(
args.output,
)
logging.info("Store checkpoints in: %s",
args.checkpointdir
)
lib.createOutputDir(args.checkpointdir)
else:
lib.createOutputDir(args.checkpointdir)

virtClient = libvirthelper.client()
try:
domObj = virtClient.getDomain(args.domain)
Expand Down Expand Up @@ -184,28 198,33 @@ def main():
logging.warn('%s', e)
sys.exit(1)

checkpointName = lib.checkpointName
checkpointName = '{:s}.0' . format(lib.checkpointName)
parentCheckpoint = False
checkpoints = []
cptFile = '%s/%s.cpt' % (args.output, args.domain)
if os.path.exists(cptFile):
with open(cptFile,'r') as cptFh:
checkpoints = json.loads(cptFh.read())

logging.info("Attempting to redefine checkpoints from: %s",
args.checkpointdir)
if virtClient.redefineCheckpoints(domObj, args) is False:
logging.warning("Unable to redefine checkpoints")

if args.level != "copy":
logging.info('Looking for checkpoints')
if args.level == "full" and checkpoints:
logging.info("Removing all existant checkpoints before full backup")
virtClient.removeAllCheckpoints(domObj, checkpoints)
virtClient.removeAllCheckpoints(domObj, checkpoints, args)
os.remove(cptFile)
checkpoints = []
elif args.level == "full" and len(checkpoints) < 1:
virtClient.removeAllCheckpoints(domObj,None)
virtClient.removeAllCheckpoints(domObj, None, args)
checkpoints = []

if checkpoints and args.level == "inc":
nextCpt = len(checkpoints) 1
checkpointName = "%s.%s" % (checkpointName, nextCpt)
nextCpt = len(checkpoints)
checkpointName = "%s.%s" % (lib.checkpointName, nextCpt)
if args.checkpoint != False:
logging.info("Overriding parent checkpoint: %s", args.checkpoint)
parentCheckpoint = args.checkpoint
Expand Down Expand Up @@ -245,6 264,10 @@ def main():
checkpoints.append(checkpointName)
with open(cptFile,'w') as cFw:
cFw.write(json.dumps(checkpoints))
if args.printonly is False and args.output != "-":
if not virtClient.backupCheckpoint(domObj, args, checkpointName):
virtClient.stopBackup(domObj)
sys.exit(1)

if args.startonly is True:
logging.info("Started backup job for debugging, exiting.")
Expand Down

0 comments on commit ebd201d

Please sign in to comment.