summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Wolff <janw@mailbox.org>2019-06-21 09:35:42 +0200
committerJan Wolff <janw@mailbox.org>2019-06-21 09:35:42 +0200
commit940fe79cafbf9e8d262af3bc0db56269c2bf74ec (patch)
tree9193d0c9c6392376431f27a85d9b0a991b749f3b
initial commit
-rw-r--r--LICENSE.md18
-rw-r--r--README.md15
-rwxr-xr-xvma.py303
3 files changed, 336 insertions, 0 deletions
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..6ed8478
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,18 @@
+Copyright (c) 2019 Jan Wolff
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..962f5d5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+VMA extractor
+=============
+
+`vma.py` implements a VMA extraction tool in Python 3.
+
+Usage:
+```sh
+./vma.py path/to/source.vma path/to/target/directory
+```
+
+I think it is pretty important to be able to read Proxmox backups outside of a
+Proxmox environment. Yet, porting their VMA implementation to a standalone
+tool proved difficult. VMA-Reader and VMA-Writer are implemented as patches to
+the Proxmox-patched version of Qemu and are thus very difficult to compile on
+non-Proxmox systems.
diff --git a/vma.py b/vma.py
new file mode 100755
index 0000000..820323c
--- /dev/null
+++ b/vma.py
@@ -0,0 +1,303 @@
+#!/usr/bin/env python3
+import os
+import sys
+import hashlib
+import struct
+import argparse
+
+
class VmaHeader():
    """Parsed header of a VMA archive.

    The constructor reads the header from ``fo``, a seekable binary file
    object whose offset 0 is the start of the archive, and leaves ``fo``
    positioned at ``header_size``, i.e. directly behind the header.

    Attributes:
        uuid: 16 raw bytes; the same value is used to mark extents.
        ctime: backup time stamp (seconds since epoch).
        md5sum: 16-byte header checksum as stored in the file.
        blob_buffer_offset: start of the blob buffer.
        blob_buffer_size: size of the blob buffer.
        header_size: overall size of the header.
        config_names: list of 256 offsets into the blob buffer.
        config_data: list of 256 offsets into the blob buffer.
        dev_info: list of 256 VmaDeviceInfoHeader objects; the index in
            this list is the 'dev_id' used inside the data streams.
        blob_buffer: dict mapping a byte offset inside the blob buffer to
            the Blob stored at that offset.

    Raises:
        EOFError: the file ends before the header is complete.
        ValueError: wrong magic, unsupported version or checksum mismatch.
    """

    # bytes 0 - 59: magic, version, uuid, ctime, md5sum, blob_buffer_offset,
    # blob_buffer_size, header_size (all integers are big endian)
    _FIXED_FIELDS = struct.Struct('>4sI16sQ16sIII')
    # uint32_t[256] table of big-endian offsets into the blob buffer
    _OFFSET_TABLE = struct.Struct('>256I')

    def __init__(self, fo):
        fixed = fo.read(self._FIXED_FIELDS.size)
        if len(fixed) != self._FIXED_FIELDS.size:
            raise EOFError('truncated VMA header')

        (magic, version, self.uuid, self.ctime, self.md5sum,
         self.blob_buffer_offset, self.blob_buffer_size,
         self.header_size) = self._FIXED_FIELDS.unpack(fixed)

        # explicit exceptions instead of assert: asserts vanish under -O
        if magic != b'VMA\0':
            raise ValueError('not a VMA file (bad magic)')
        if version != 1:
            raise ValueError(f'unsupported VMA version {version}')

        self._verify_checksum(fo)

        # 60 - 2043: reserved
        fo.seek(2044, os.SEEK_SET)

        # 2044 - 3067: uint32_t config_names[256]
        self.config_names = self._read_offset_table(fo)

        # 3068 - 4091: uint32_t config_data[256]
        self.config_data = self._read_offset_table(fo)

        # 4092 - 4095: reserved
        fo.seek(4, os.SEEK_CUR)

        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
        self.dev_info = [VmaDeviceInfoHeader(fo, self) for _ in range(256)]

        # 12288 - header_size: blob buffer
        self._read_blob_buffer(fo)

        # make sure the file object points at the end of the vma header
        fo.seek(self.header_size, os.SEEK_SET)

    def _verify_checksum(self, fo):
        """Compare the stored md5sum with one computed over the header.

        The checksum covers bytes 0 to header_size, with the md5sum field
        (bytes 32 - 47) filled with zeros.
        """
        fo.seek(0, os.SEEK_SET)
        raw = fo.read(self.header_size)
        if len(raw) != self.header_size:
            raise EOFError('truncated VMA header')
        digest = hashlib.md5(raw[:32] + bytes(16) + raw[48:]).digest()
        if digest != self.md5sum:
            raise ValueError('VMA header checksum mismatch')

    @classmethod
    def _read_offset_table(cls, fo):
        """Read one table of 256 big-endian uint32 offsets as a list."""
        raw = fo.read(cls._OFFSET_TABLE.size)
        if len(raw) != cls._OFFSET_TABLE.size:
            raise EOFError('truncated VMA header')
        return list(cls._OFFSET_TABLE.unpack(raw))

    def _read_blob_buffer(self, fo):
        """Fill self.blob_buffer with all blobs, keyed by their offset.

        Byte-wise offsets are used to address the blob buffer, so the blobs
        are stored in a dict with the offsets as keys. The first blob starts
        at offset 1; the byte at offset 0 is skipped.
        """
        blob_buffer_end = self.blob_buffer_offset + self.blob_buffer_size
        # seek explicitly instead of assuming the buffer follows dev_info
        fo.seek(self.blob_buffer_offset + 1, os.SEEK_SET)

        self.blob_buffer = {}
        while fo.tell() < blob_buffer_end:
            offset = fo.tell() - self.blob_buffer_offset
            self.blob_buffer[offset] = Blob(fo)
            # a blob that consumed nothing means we hit EOF; without this
            # check a truncated file would loop forever
            if fo.tell() - self.blob_buffer_offset <= offset:
                raise EOFError('truncated VMA blob buffer')
+
+
class VmaDeviceInfoHeader():
    """One 32-byte entry of the dev_info table inside the VMA header.

    Attributes:
        device_name: offset of the device name inside the blob buffer.
        device_size: device size in bytes.

    Raises:
        EOFError: fewer than 32 bytes could be read from ``fo``.
    """

    # 0 - 3: device name (offset into blob_buffer table)
    # 4 - 7: reserved
    # 8 - 15: device size in bytes
    # 16 - 31: reserved
    _LAYOUT = struct.Struct('>I4xQ16x')

    def __init__(self, fo, vma_header):
        # kept so get_name() can resolve the name through the blob buffer
        self.__vma_header = vma_header

        raw = fo.read(self._LAYOUT.size)
        if len(raw) != self._LAYOUT.size:
            # a short read would otherwise silently parse as zeros
            raise EOFError('truncated VMA device info header')

        self.device_name, self.device_size = self._LAYOUT.unpack(raw)

    def get_name(self):
        """Return the device name as str.

        The name blob is interpreted as a null-terminated utf-8 string.
        Raises KeyError if the blob buffer has no blob at ``device_name``.
        """
        name = self.__vma_header.blob_buffer[self.device_name].data
        return name.partition(b'\0')[0].decode('utf-8')
+
+
class VmaExtentHeader():
    """512-byte header that precedes the cluster data of one extent.

    Attributes:
        block_count: overall number of contained 4K blocks.
        uuid: 16 raw bytes; same uuid as used in the VMA header.
        md5sum: 16-byte header checksum as stored in the file (this class
            does not verify it).
        blockinfo: list of 59 Blockinfo entries.

    Raises:
        EOFError: the file ends inside the extent header.
        ValueError: the extent magic is wrong.
    """

    # 0 - 3: magic ("VMAE")
    # 4 - 5: reserved
    # 6 - 7: block_count
    # 8 - 23: uuid
    # 24 - 39: md5sum (this field is zeroed to generate the checksum)
    _FIXED_FIELDS = struct.Struct('>4s2xH16s16s')

    def __init__(self, fo, vma_header):
        fixed = fo.read(self._FIXED_FIELDS.size)
        if len(fixed) != self._FIXED_FIELDS.size:
            raise EOFError('truncated VMA extent header')

        (magic, self.block_count, self.uuid,
         self.md5sum) = self._FIXED_FIELDS.unpack(fixed)

        # explicit exception instead of assert: asserts vanish under -O
        if magic != b'VMAE':
            raise ValueError('bad VMA extent magic')

        # 40 - 511: blockinfo[59]
        self.blockinfo = [Blockinfo(fo, vma_header) for _ in range(59)]
+
+
class Blob():
    """Length-prefixed chunk of bytes read from the VMA blob buffer.

    Attributes:
        size: number of payload bytes.
        data: the payload (exactly ``size`` bytes).

    Raises:
        EOFError: the stream ends inside the size prefix or the payload.
    """

    def __init__(self, fo):
        # the size of a blob is a two-byte int in LITTLE endian
        # source: original c code of vma-reader
        # uint32_t size = vmar->head_data[bstart] +
        #     (vmar->head_data[bstart+1] << 8);
        prefix = fo.read(2)
        if len(prefix) != 2:
            raise EOFError('truncated VMA blob (size prefix)')
        self.size = int.from_bytes(prefix, 'little')

        self.data = fo.read(self.size)
        if len(self.data) != self.size:
            # never hand out a blob whose data is shorter than its size
            raise EOFError('truncated VMA blob (payload)')
+
+
class Blockinfo():
    """One 8-byte blockinfo entry of an extent header.

    Attributes:
        mask: 16-bit mask; extract() reads one 4K block from the archive
            for every set bit and writes zeros for every cleared bit.
        dev_id: device ID (offset into the dev_info table).
        cluster_num: number of the cluster inside the device.

    Raises:
        EOFError: fewer than 8 bytes could be read from ``fo``.
    """

    CLUSTER_SIZE = 65536

    # 0 - 1: mask
    # 2: reserved
    # 3: dev_id
    # 4 - 7: cluster_num
    _LAYOUT = struct.Struct('>HxBI')

    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header

        raw = fo.read(self._LAYOUT.size)
        if len(raw) != self._LAYOUT.size:
            # a short read would otherwise silently parse as zeros
            raise EOFError('truncated VMA blockinfo entry')

        self.mask, self.dev_id, self.cluster_num = self._LAYOUT.unpack(raw)
+
+
def extract_configs(fo, args, vma_header):
    """
    Write all config files stored in the VMA header to args.destination.

    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.
    The filename seems to be a null-terminated string, while the content is not
    terminated.

    fo is unused; it is kept for interface compatibility.
    args must provide 'destination' (target directory) and 'verbose'.

    Raises ValueError if a stored filename is not a plain file name, so that
    an archive can not write outside of the destination directory.
    """

    if args.verbose: print('extracting configs...')

    for name_offset, data_offset in zip(vma_header.config_names,
                                        vma_header.config_data):
        # offset 0 marks an unused slot
        if name_offset == 0: continue

        # interpret filename as a null-terminated utf-8 string
        raw_name = vma_header.blob_buffer[name_offset].data
        config_name = raw_name.partition(b'\0')[0].decode('utf-8')

        # the name comes from the archive: refuse anything that could leave
        # the destination directory ('../x', 'a/b', absolute paths, ...)
        if (config_name in ('', '.', '..')
                or config_name != os.path.basename(config_name)):
            raise ValueError(f'unsafe config file name: {config_name!r}')

        if args.verbose: print(f'{config_name}...', end='')

        config_data = vma_header.blob_buffer[data_offset].data

        with open(os.path.join(args.destination, config_name), 'wb') as config_fo:
            config_fo.write(config_data)

        if args.verbose: print(' OK')
+
+
def extract(fo, args):
    """
    Extract all configs and device images of a VMA archive.

    fo is a seekable binary file object containing the archive.
    args must provide 'destination' (target directory, created if missing)
    and 'verbose'.

    After the VMA header the archive is read as a sequence of extents. Each
    extent starts with an extent header holding 59 blockinfo entries; for
    every entry with a non-zero dev_id one cluster of 16 blocks of 4096 bytes
    is written to the device file at cluster_num * Blockinfo.CLUSTER_SIZE.

    Raises ValueError if the archive is inconsistent (uuid mismatch, unknown
    device id, unsafe device name) and EOFError if it ends inside a cluster.
    """
    os.makedirs(args.destination, exist_ok=True)

    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)

    vma_header = VmaHeader(fo)

    extract_configs(fo, args, vma_header)

    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)

    if args.verbose: print('extracting devices...')

    zero_block = bytes(4096)

    # open file handlers for all devices within the VMA
    # so we can easily append data to arbitrary devices
    device_fos = {}
    # per device: offset at which the next sequential cluster would start.
    # tracked per device because clusters of several devices may interleave
    next_offsets = {}

    try:
        for dev_id, dev_info in enumerate(vma_header.dev_info):
            if dev_info.device_size > 0:
                dev_name = dev_info.get_name()
                # the name comes from the archive: refuse anything that
                # could leave the destination directory
                if (dev_name in ('', '.', '..')
                        or dev_name != os.path.basename(dev_name)):
                    raise ValueError(f'unsafe device name: {dev_name!r}')

                if args.verbose: print(dev_name)
                device_fos[dev_id] = open(
                    os.path.join(args.destination, dev_name), 'wb')
                next_offsets[dev_id] = 0

        if args.verbose: print('this may take a while...')

        while fo.tell() < filesize:
            # when there is data to read at this point, we can safely expect
            # a full extent header with additional clusters
            extent_header = VmaExtentHeader(fo, vma_header)
            if extent_header.uuid != vma_header.uuid:
                raise ValueError('extent uuid does not match VMA header uuid')

            for blockinfo in extent_header.blockinfo:
                if blockinfo.dev_id == 0: continue

                try:
                    device_fo = device_fos[blockinfo.dev_id]
                except KeyError:
                    raise ValueError(
                        f'extent references unknown device {blockinfo.dev_id}'
                    ) from None

                cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE

                # non-sequential clusters encountered, handle this case
                if cluster_pos != next_offsets[blockinfo.dev_id]:
                    if args.verbose: print('non sequential cluster encountered...')

                    # if the cluster lies behind the current end of the
                    # file, fill the missing size with zeros
                    device_fo.seek(0, os.SEEK_END)
                    written_size = device_fo.tell()

                    if written_size < cluster_pos:
                        # add padding for missing clusters
                        padding = cluster_pos - written_size
                        if args.verbose:
                            print(f'{blockinfo.cluster_num}')
                            print(f'adding {padding} bytes of padding...')

                        # write padding in chunks of 4096 bytes to avoid
                        # memory errors
                        while padding > 0:
                            device_fo.write(zero_block[:min(padding, 4096)])
                            padding -= 4096

                    # seek to start of new cluster
                    device_fo.seek(cluster_pos, os.SEEK_SET)

                for i in range(16):
                    # a 2-bytes wide bitmask: a set bit means the 4k block is
                    # stored in the archive, a cleared bit means only zeros
                    if (1 << i) & blockinfo.mask:
                        block = fo.read(4096)
                        if len(block) != 4096:
                            # a short block would shift all following data
                            raise EOFError('truncated VMA cluster data')
                        device_fo.write(block)
                    else:
                        device_fo.write(zero_block)

                next_offsets[blockinfo.dev_id] = (
                    cluster_pos + Blockinfo.CLUSTER_SIZE)
    finally:
        # close the device files even if extraction failed half way
        if args.verbose: print('closing file handles...')
        for device_fo in device_fos.values():
            device_fo.close()

    if args.verbose: print('done')
+
+
def main():
    """
    Command line entry point: extract a VMA archive into a directory.

    Returns the process exit code: 0 on success, 1 if the source can not be
    used or the destination is refused. An existing destination is only
    accepted with -f/--force, and only if it is a directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', type=str)
    parser.add_argument('destination', type=str)
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('-f', '--force', default=False, action='store_true')
    args = parser.parse_args()

    if not os.path.exists(args.filename):
        print('Error! Source file does not exist!', file=sys.stderr)
        return 1

    # a directory passes the exists() check but can not be opened for reading
    if os.path.isdir(args.filename):
        print('Error! Source path is a directory!', file=sys.stderr)
        return 1

    if os.path.exists(args.destination):
        if not args.force:
            print('Error! Destination path exists!', file=sys.stderr)
            return 1
        # --force only helps if we can actually extract into the path
        if not os.path.isdir(args.destination):
            print('Error! Destination path is not a directory!',
                  file=sys.stderr)
            return 1

    with open(args.filename, 'rb') as fo:
        extract(fo, args)

    return 0


if __name__ == '__main__':
    sys.exit(main())