#!/usr/bin/python3 -u
# -*- mode: Python; coding: utf-8 -*-

# Fix issues with missing OSTree objects
#
# Copyright (C) 2017 Endless Mobile, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""Fix issues with OSTree missing objects

When OSTree repository objects have been inadvertently deleted, it can
cause two types of problems (among others):

1. If the deleted object is part of a commit, then the commit is now
partial, but OSTree doesn't know that unless a commitpartial file
exists. Without that, it will assume the commit is fully intact and use
it as the source for a static delta.

2. If the deleted object is a commit, then any references to it will be
dangling. This will cause errors since OSTree assumes that a referenced
commit will exist and will raise errors as soon as it tries to be used.

This script attempts to address these 2 issues by repulling the commits
for any dangling references and marking any commits with missing objects
as partial.

To guard against another program operating on the repository, all
processes that have the repository open are killed.

Some commands for testing this script when hacking on it (from a
throwaway working system):

```
for objtype in commit dirtree dirmeta file; do
    sudo find /ostree/repo/objects -type f -name "*.${objtype}" -print -delete -quit
done
sudo eos-fix-ostree-repo
sudo ostree fsck
```

Make sure to test on normal and split-disk systems.
"""

from argparse import ArgumentParser
from fnmatch import fnmatch
import gi
gi.require_version('OSTree', '1.0')
from gi.repository import GLib, Gio, OSTree
import os
import pwd
import signal
import stat
import sys
import time

# Older OSTree versions had the GI annotation wrong, and the enum
# value was OSTree.RepoCommitState.REPO_COMMIT_STATE_PARTIAL while
# recent versions have OSTree.RepoCommitState.PARTIAL [1].
#
# So we try to load OSTree.RepoCommitState.PARTIAL and, if that fails,
# override OSTree.RepoCommitState.PARTIAL to be the old enum value.
#
# [1] See https://github.com/ostreedev/ostree/pull/1335
if 'PARTIAL' not in dir(OSTree.RepoCommitState):
    OSTree.RepoCommitState.PARTIAL = OSTree.RepoCommitState.REPO_COMMIT_STATE_PARTIAL

# Prior to ostree-2017.1, the GI annotation for ostree_repo_list_objects
# was wrong, making it unusable from bindings[1]. This is tricky to
# detect since the version checking was not added until ostree-2017.3.
# However, in the same commit,
# ostree_repo_list_commit_objects_starting_with was fixed so that the
# out_commits parameter was marked properly.
#
# Use the direction of this argument as a proxy to know when
# ostree_repo_list_objects will work. Otherwise, make our own
# list_objects implementation and monkey-patch it into the Repo class.
#
# 1. https://github.com/ostreedev/ostree/commit/300752e5
_func = OSTree.Repo.list_commit_objects_starting_with
if _func.get_arguments()[1].get_direction() == gi._gi.Direction.IN:
    import glob

    # Fake ostree_repo_list_objects. See
    # https://github.com/ostreedev/ostree/blob/master/src/libostree/ostree-repo.c
    # for the real implementation.
    def _list_objects(self, flags, cancellable=None):
        """Enumerate the loose objects in the repo's objects directory

        Returns a (True, objects) tuple where objects maps serialized
        object names to a (bas) variant. The flags and cancellable
        arguments are accepted for signature compatibility only and
        are ignored.
        """
        objects = {}
        repo_path = self.get_path().get_path()
        repo_mode = self.get_mode()

        # Objects live in the objects directory split after the 2nd
        # character of their sha256sum. E.g.,
        # 8d/2925839245dd91ac9fbfcc0e7a383cddf5145bd7c5bc5de0d46929a3fa5963.file.
        objdir_pattern = repo_path + '/objects/[a-f0-9][a-f0-9]'
        for objdir in glob.iglob(objdir_pattern):
            for entry in os.listdir(objdir):
                name, ext = os.path.splitext(entry)
                if len(name) != 62:
                    # Not a partial sha256
                    continue

                if ext == '':
                    continue
                elif (ext == '.filez' and
                      repo_mode == OSTree.RepoMode.ARCHIVE_Z2):
                    objtype = OSTree.ObjectType.FILE
                elif (ext == '.file' and
                      repo_mode != OSTree.RepoMode.ARCHIVE_Z2):
                    objtype = OSTree.ObjectType.FILE
                elif ext == '.dirtree':
                    objtype = OSTree.ObjectType.DIR_TREE
                elif ext == '.dirmeta':
                    objtype = OSTree.ObjectType.DIR_META
                elif ext == '.commit':
                    objtype = OSTree.ObjectType.COMMIT
                else:
                    continue

                # Insert the object. The key is the serialized object
                # name and the value is always the same (bas) variant (I
                # think packed objects were supposed to put something
                # here, but only loose objects ever exist).
                checksum = os.path.basename(objdir) + name
                key = OSTree.object_name_serialize(checksum, objtype)
                value = GLib.Variant.new_tuple(
                    GLib.Variant('b', True),
                    GLib.Variant('as', [])
                )
                objects[key] = value

        return True, objects

    # Override the standard list_objects
    OSTree.Repo.list_objects = _list_objects


def kill_repo_procs(repo_path, sig):
    """Kill all processes with repo open

    Walk /proc to find any process with the repo directory open and kill
    them with signal sig. The current process is never signalled.
    Processes that exit while being inspected or signalled are silently
    skipped.
    """
    print('Killing processes with', repo_path, 'open with signal', sig)
    self_pid = os.getpid()
    repo_prefix = repo_path + '/'
    for pid in os.listdir('/proc'):
        # Only the all-digit entries in /proc are process directories.
        # isdigit() (rather than isnumeric()) guarantees int() succeeds.
        if not pid.isdigit():
            continue
        if int(pid) == self_pid:
            continue

        # The process may have exited
        try:
            proc_fds = os.listdir(os.path.join('/proc', pid, 'fd'))
        except FileNotFoundError:
            continue

        for fd in proc_fds:
            # The process may have exited or the file may have been closed
            try:
                fd_path = os.readlink(os.path.join('/proc', pid, 'fd', fd))
            except FileNotFoundError:
                continue

            # If the open file is the repo or a path within the repo,
            # kill the process
            if fd_path == repo_path or fd_path.startswith(repo_prefix):
                # Try to read the exe file for information, but in some
                # cases (kernel thread), it may not exist
                try:
                    pid_exe = os.readlink(os.path.join('/proc', pid, 'exe'))
                except OSError:
                    pid_exe = ''

                # Kill it and go to the next process
                print('Killing pid', pid, pid_exe, 'with signal', sig)
                try:
                    os.kill(int(pid), sig)
                except ProcessLookupError:
                    # The process exited between the fd scan and the
                    # kill, which is exactly the outcome we wanted.
                    pass
                break


def pull_commit(repo, remote, checksum, full=False):
    """Pull commit from remote

    When full is False, only the commit metadata will be pulled.
    Otherwise the whole commit is pulled. The pull uses depth 0, so no
    parent commits are fetched. Any GLib.Error raised by the pull is
    propagated to the caller.
    """
    if full:
        flags = OSTree.RepoPullFlags.NONE
    else:
        flags = OSTree.RepoPullFlags.COMMIT_ONLY
    opts = GLib.Variant('a{sv}', {
        'flags': GLib.Variant('i', flags),
        'refs': GLib.Variant('as', (checksum,)),
        'depth': GLib.Variant('i', 0),
    })

    # FIXME: For some reason, pull_with_options cannot be stopped with
    # ^C from the keyboard (SIGINT). This could be a problem in ostree
    # or pygobject, but I suspect it has something to do with what pull
    # does with the main context.
    progress = OSTree.AsyncProgress.new()
    progress.connect('changed',
                     OSTree.Repo.pull_default_console_progress_changed,
                     None)
    try:
        repo.pull_with_options(remote, opts, progress)
    finally:
        # Always finish the progress reporter, even when the pull fails
        progress.finish()


def fix_dangling_refs(repo):
    """Update repo refs where the commit is missing

    This does a commit metadata only pull so the refs are valid again.
    Refs without a remote are assumed to belong to the "eos" remote.
    Errors other than G_IO_ERROR_NOT_FOUND when loading a commit are
    re-raised.
    """
    repo_path = os.path.realpath(repo.get_path().get_path())
    print('Fixing refs pointing to missing commits in', repo_path)

    _, all_refs = repo.list_refs()
    for refspec, checksum in all_refs.items():
        try:
            repo.load_commit(checksum)
        except GLib.Error as err:
            if not err.matches(Gio.io_error_quark(),
                               Gio.IOErrorEnum.NOT_FOUND):
                raise

            # Try to pull the commit metadata again.
            _, remote, ref = OSTree.parse_refspec(refspec)
            if remote is None:
                # If there's no remote, assume it's an ostree ref and
                # use "eos" as the remote.
                print('No remote for ref', ref, 'assuming "eos"')
                remote = 'eos'
            print('Pulling', checksum, 'commit metadata from', remote,
                  'for', ref)
            pull_commit(repo, remote, checksum)


def mark_commits_partial(repo):
    """Mark commits with missing objects as partial

    Every commit object in the repo is traversed. If the traversal
    fails with G_IO_ERROR_NOT_FOUND or any reachable object is not in
    the repo's object list, an empty state/<checksum>.commitpartial
    file is created for the commit.
    """
    repo_path = os.path.realpath(repo.get_path().get_path())
    print('Marking commits with missing objects as partial in',
          repo_path)

    _, all_objects = repo.list_objects(OSTree.RepoListObjectsFlags.ALL,
                                       None)
    for objname in all_objects:
        checksum, objtype = OSTree.object_name_deserialize(objname)
        if objtype != OSTree.ObjectType.COMMIT:
            continue

        _, commit, state = repo.load_commit(checksum)
        if state == OSTree.RepoCommitState.PARTIAL:
            print('Commit', checksum, 'already marked as partial')
            continue

        mark_partial = False
        try:
            # If a dirtree is missing, traverse_commit will fail with
            # G_IO_ERROR_NOT_FOUND.
            _, reachable_objects = repo.traverse_commit(checksum, 0)

            # Unfortunately, it doesn't check that the leaves (dirmeta
            # and files) exist, so we need to do that manually. In case
            # that behavior ever changes, just check that all the
            # reachable objects exist.
            #
            # https://github.com/ostreedev/ostree/issues/1222
            for commit_obj in reachable_objects:
                if commit_obj not in all_objects:
                    mark_partial = True
                    break
        except GLib.Error as err:
            if not err.matches(Gio.io_error_quark(),
                               Gio.IOErrorEnum.NOT_FOUND):
                raise
            mark_partial = True

        if mark_partial:
            print('Marking commit', checksum, 'as partial')
            commit_partial_path = os.path.join(repo_path, 'state',
                                               checksum + '.commitpartial')
            # Only the existence of the file matters, so create it empty
            with open(commit_partial_path, 'w'):
                pass


def pull_partial_refs(repo):
    """Try to fully restore any partial referenced commits

    Only refs with a remote are considered. Locale refs are skipped
    since they are intentionally partial.
    """
    # Look for any partial refs and re-pull them.
    _, all_refs = repo.list_refs()
    for refspec, checksum in all_refs.items():
        _, remote, ref = OSTree.parse_refspec(refspec)
        if remote is None:
            # Don't bother pulling local refs. Only the ostree deploys
            # are local, and as long as they're marked partial, they can
            # be updated later.
            continue

        # If this is an app or runtime locale, it's intentionally
        # partial since only the relevant subpaths are pulled. Skip it
        # to not use up extra bandwidth and disk space.
        if fnmatch(ref, '*/*.Locale/*/*'):
            print('Skipping intentionally partial Locale commit',
                  refspec, checksum)
            continue

        _, commit, state = repo.load_commit(checksum)
        if state != OSTree.RepoCommitState.PARTIAL:
            continue

        # Try to pull the full commit again.
        print('Pulling', checksum, 'commit from', remote, 'for', ref)
        pull_commit(repo, remote, checksum, full=True)


def main():
    """Parse the command line and repair the selected OSTree repo

    Exits with status 1 when not run as root, when the repo is not
    owned by the effective user, when the sysroot cannot be locked, or
    when the repo's tmp/cache directory has disappeared.
    """
    aparser = ArgumentParser(
        description='Fix broken OSTree repo'
    )
    path_group = aparser.add_mutually_exclusive_group()
    path_group.add_argument('--sysroot', help='path to OSTree sysroot')
    path_group.add_argument('--repo', help='path to OSTree repo')
    args = aparser.parse_args()

    if os.geteuid() != 0:
        print('Must be root to run', sys.argv[0], file=sys.stderr)
        sys.exit(1)

    print('WARNING: Do not start App Center while this is running')

    if args.repo is not None:
        # Use a repo directly instead of getting it from the sysroot
        sysroot = None
        repo_file = Gio.File.new_for_path(args.repo)
        repo = OSTree.Repo.new(repo_file)
        repo.open()
    else:
        # Get the repo from the sysroot
        if args.sysroot is None:
            sysroot_file = None
        else:
            sysroot_file = Gio.File.new_for_path(args.sysroot)
        sysroot = OSTree.Sysroot.new(sysroot_file)
        sysroot.load()
        _, repo = sysroot.get_repo()

    # Resolve the full repo path
    repo_path = os.path.realpath(repo.get_path().get_path())

    # Must be running as the owner of the repo. We don't want to make
    # root owned files in a non-root owned repo.
    repo_uid = os.stat(repo_path).st_uid
    if os.geteuid() != repo_uid:
        # Try to get the repo owner's name
        try:
            repo_user = pwd.getpwuid(repo_uid).pw_name
        except KeyError:
            # No passwd entry for this uid, so report the number instead
            repo_user = repo_uid
        print(repo_path, 'is owned by', repo_user, 'not root',
              file=sys.stderr)
        sys.exit(1)

    # Kill once with SIGTERM, then with SIGKILL
    kill_repo_procs(repo_path, signal.SIGTERM)
    time.sleep(1)
    kill_repo_procs(repo_path, signal.SIGKILL)

    # Now lock the sysroot if one is in use
    if sysroot is not None and not sysroot.try_lock():
        print('Could not lock sysroot', sysroot.get_path().get_path(),
              file=sys.stderr)
        sys.exit(1)

    # In older OSTree, cleaning up after a transaction (e.g., a pull)
    # would delete the tmp/cache directory if it was older than 1 day.
    # That's a problem because it has an open fd for that directory.
    # Update the directory's mtime to current. This is racy because
    # other repo users may have deleted the directory after we opened
    # the repo and before they were killed, so just fail if the
    # directory doesn't exist.
    cache_dir = os.path.join(repo_path, 'tmp', 'cache')
    try:
        os.utime(cache_dir)
    except FileNotFoundError:
        print(cache_dir, 'does not exist - run', sys.argv[0], 'again!',
              file=sys.stderr)
        sys.exit(1)

    # First, fix dangling refs so that refs can be reliably listed again
    fix_dangling_refs(repo)

    # Next, traverse all commits to mark any as partial
    mark_commits_partial(repo)

    # Finally, try to completely pull in any partial referenced commits
    # so there are no longer any missing objects
    pull_partial_refs(repo)

    print('\nSuccess! Try to update the OS and Apps now.')


if __name__ == '__main__':
    main()