/* -*- mode: c; c-basic-offset: 8; -*-
 * vim:shiftwidth=8:tabstop=8:
 *
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2004  Hewlett-Packard Co.
 * Copyright 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2011  Whamcloud, Inc.
 */
/***********************************************************************
 * This program takes as input files generated by running e2fsck on
 * the mds and ost filesystems. The file generated for each ost
 * contains a table including the object id and size for each object
 * extant on the ost in each entry.
 * The file generated from scanning the mds filesystem with e2fsck
 * contains multiple tables one for each ost. In each table an entry
 * contains the mds fid as well as the object id on the appropriate
 * ost. In addition there is an additional table that holds the mds_fid
 * and the containing directory fid for each entry. This is used for
 * name lookup.
 * There are three basic checks
 * 1) Make sure that multiple mds entries do not reference the same object
 * 2) Cross reference each object on each ost to make sure a "containing"
 *    file for this exists on the mds
 * 3) For each file on the mds make sure that the associated objects exist
 *    on the osts
 * These checks and potential correction for errors found are run from
 * run_pass*
 * Each of these checks simply iterates through the appropriate table,
 * cross-checks it against another table, and repairs any errors found.
 ***************************************************************************/
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <utime.h>
#include <unistd.h>
#include <sys/mman.h>
#include <getopt.h>
#include <mntent.h>
#include <dirent.h>
#ifdef HAVE_SYS_IOCTL_H
#include <sys/ioctl.h>
#endif
#include "../version.h"
#include "e2fsck.h"
#include "ext2fs/lfsck.h"
#include "ext2fs/ext2fs.h"

/* Chain of FIDs produced by lfsck_get_fids() for a path lookup. */
struct lfsck_fids {
	int depth;		/* index of the last valid entry in fids[] */
	struct lu_fid *fids;	/* malloc()ed array holding depth + 1 FIDs */
};

/*
 * Per-thread work description.
 * NOTE(review): the code that fills and consumes this structure is not in
 * the part of the file reviewed here; the field notes below are inferred
 * from the names only and should be confirmed.
 */
struct lfsck_thread_info {
	struct lfsck_mds_hdr *mds_hdr;	/* presumably the shared MDS db header */
	DB *mds_direntdb;		/* presumably the FID -> parent dir table */
	DB *mds_sizeinfodb;		/* presumably the per-file size table */
	__u32 start_ost_idx;		/* presumably first OST index to check */
	__u32 end_ost_idx;		/* presumably last OST index to check */
	int status;			/* presumably the thread's result code */
};

/*
 * One entry of the lfsck_duplicates array: an MDS file that references an
 * OST object also referenced by another file.  Entries are appended by
 * lfsck_save_duplicate().
 */
struct lfsck_saved_duplicates {
	struct lu_fid	ld_mds_fid;	/* FID of the referencing MDS file */
	struct ost_id	ld_oi;		/* id of the multiply-referenced object */
	__u32		ld_ost_idx;	/* OST index holding the object */
	char		*ld_link;	/* initialised to NULL when saved */
};

/* Allow building where O_LARGEFILE is not defined. */
#ifndef O_LARGEFILE
#define O_LARGEFILE 0
#endif

#define LOG_PATH "/var/log/lfsck.log"	/* file opened by log_open() */
#define RLIMIT 1024	/* growth step (in entries) of lfsck_duplicates */

/* Global state */

char *progname = "lfsck";
FILE *logfile;		/* set by log_open(); log_write() skips it if NULL */

/* Command-line flags, counted/reset in parse_args() */
int lfsck_help;
int lfsck_save;
int lfsck_delete;
int lfsck_create;
int lfsck_force;
int lfsck_verbose;
int lfsck_yes;

int num_threads = 1;	/* set by -t, clamped in parse_args() */

char mnt_path[PATH_MAX];	/* realpath() of the mountpoint argument */
char *mds_file;			/* resolved path given with --mdsdb */
char lostandfounddir[PATH_MAX];	/* directory used by lfsck_fix_orphan() */
char dupedir[PATH_MAX];
char *ost_files[LOV_MAX_OSTS];	/* resolved paths given with --ostdb */
int num_ost_files;		/* number of valid entries in ost_files[] */

/* Filled by llapi_lov_get_uuids() in get_lov_config() */
struct obd_uuid lfsck_uuid[LOV_MAX_OSTS];
int lov_tgt_count = LOV_MAX_OSTS;

/* Duplicate references recorded by lfsck_save_duplicate() */
struct lfsck_saved_duplicates *lfsck_duplicates;
int lfsck_dup_saved;
/* Counters of repairs that succeeded / failed */
int fixed;
int fix_failed;

/* NOTE(review): init_cond, init_mutex and all_started are not used in the
 * part of the file reviewed here. */
pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
/* phase_lock is held while lfsck_save_duplicate() updates lfsck_duplicates */
pthread_mutex_t phase_lock = PTHREAD_MUTEX_INITIALIZER;
/* size_lock is held by lfsck_calc_size() around its sizeinfo db update */
pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;
int all_started;

/*
 * Translate a path on the mounted filesystem into a FID.
 *
 * When LL_IOC_PATH2FID is available at compile time llapi_path2fid() is
 * tried first; any result other than -ENOTTY is returned as is.  Otherwise
 * (or when the ioctl is unimplemented at runtime) an IGIF-style FID is
 * emulated from the inode number and the inode generation.
 *
 * Returns 0 (or the non-negative ioctl result) on success and a negative
 * errno value on failure.
 */
int lfsck_path2fid(const char *path, struct lu_fid *fid)
{
	struct stat stbuf;
	long gen;
	int fd;
	int rc = 0;

#ifdef LL_IOC_PATH2FID
	rc = llapi_path2fid(path, fid);
	if (rc != -ENOTTY)
		return rc;
#endif
	/* LL_IOC_PATH2FID was landed in 1.8.2.  If it doesn't exist at compile
	 * time, or it fails at runtime with a return ENOTTY indicating that
	 * the ioctl is unimplemented, emulate it here for the older clients.
	 * Assume the server is running Lustre 1.x and create an IGIF FID,
	 * since < 1.8.4 will not work properly with 2.x servers anyway. */
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	if (fstat(fd, &stbuf) < 0) {
		rc = -errno;
	} else {
		/* sequence comes from the inode number */
		fid->f_seq = stbuf.st_ino;

		rc = ioctl(fd, FSFILT_IOC_GETVERSION, &gen);
		if (rc < 0) {
			rc = -errno;
		} else {
			/* object id comes from the inode generation */
			fid->f_oid = gen;
			fid->f_ver = 0;
		}
	}

	close(fd);
	return rc;
}

/* printf() the message to stdout when lfsck_verbose is at least lvl. */
#define VERBOSE(lvl, fmt, args...)					\
do { if (lfsck_verbose >= lvl) printf(fmt, ## args); } while (0)

/*
 * Open LOG_PATH for appending and write a start banner with the current
 * time.  Returns 0 on success, -EPERM if the log file cannot be opened.
 */
int log_open()
{
	time_t now;

	logfile = fopen(LOG_PATH, "a");
	if (logfile == NULL) {
		fprintf(stderr, "%s: Failed to open log file %s\n",
			progname, LOG_PATH);
		return -EPERM;
	}

	now = time(NULL);
	fprintf(logfile, "===============================================\n\n");
	fprintf(logfile, "Starting lfsck %s\n", ctime(&now));

	return 0;
}

/*
 * Write the end-of-run banner and close the log file.
 *
 * status < 0 records an aborted run, anything else a completed run with a
 * timestamp.  Does nothing if the log was never opened.  Always returns 0.
 */
int log_close(int status)
{
	time_t tm;

	if (logfile == NULL)
		return 0;

	time(&tm);
	if (status < 0)
		fprintf(logfile, "ERROR: lfsck aborted\n");
	else
		fprintf(logfile, "lfsck run completed:  %s\n", ctime(&tm));
	fprintf(logfile, "===============================================\n\n");

	fclose(logfile);
	/* log_write() and a repeated log_close() test this pointer; clear it
	 * so they never touch the closed stream */
	logfile = NULL;

	return 0;
}

/*
 * printf-style logging: the message goes to the log file (when one is
 * open) and then to stderr.
 */
void log_write(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (logfile != NULL) {
		va_list ap_log;

		/* each vfprintf() consumes its list, so give the log file
		 * its own copy */
		va_copy(ap_log, ap);
		vfprintf(logfile, fmt, ap_log);
		va_end(ap_log);
	}
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

/* Print the command-line help text to stdout. */
void usage()
{
	printf("\n");
	printf("usage: lfsck [-cdfhlntv] --mdsdb mdsdb "
	       "--ostdb ostdb1 [ostdb2 ...] filesystem\n\n");
	printf("\t-m|--mdsdb mdsdb  MDS database from e2fsck --mdsdb\n");
	printf("\t-o|--ostdb ostdb  OST databases from e2fsck --ostdb\n");
	printf("\tfilesystem        full path of lustre mountpoint\n");
	printf("\t[-c|--create]     create missing objects\n");
	printf("\t[-d|--delete]     delete orphan objects\n");
	printf("\t[-f|--force]      force running if fs appears unmounted\n");
	printf("\t[-h|--help]       print this message\n");
	printf("\t[-l|--lostfound]  save orphan objects to lost+found\n");
	printf("\t[-n|--nofix]      do not fix filesystem errors (default)\n");
	/* accepted by parse_args(), which clamps the value to 1..128 */
	printf("\t[-t|--threads n]  number of threads to use (max 128)\n");
	printf("\t[-v|--verbose]    print verbose runtime messages\n");
	//printf("\t[-y|--yes]        do all cleanup automatically\n");
	printf("\n");
}

/*
 * Get the lov config for the filesystem - this is primarily used to correlate
 * each ost db file with its index in the lov configuration. Obviously this is
 * critical.
 */
int get_lov_config()
{
	int fd, rc;

	fd = open(mnt_path, O_RDONLY);
	if (fd < 0) {
		log_write("Error: opening %s\n", mnt_path);
		return (-errno);
	}

	rc = llapi_lov_get_uuids(fd, lfsck_uuid, &lov_tgt_count);

	close(fd);
	return rc;
}

int parse_args(int argc, char *argv[])
{
	int option_index = 0;
	char *path_name = NULL;
	struct option long_options[] = {
		{ "create", 1, NULL, 'c' },
		{ "delete", 0, NULL, 'd' },
		{ "force", 0, NULL, 'f' },
		{ "help", 0, NULL, 'h' },
		{ "lostfound", 0, NULL, 'l' },
		{ "mdsdb", 1, NULL, 'm' },
		{ "mdtdb", 1, NULL, 'm' },
		{ "nofix", 0, NULL, 'n' },
		{ "ostdb", 1, NULL, 'o' },
		{ "threads", 1, NULL, 't' },
		{ "verbose", 0, NULL, 'v' },
		//{ "yes", 0, NULL, 'y' },
		{ 0, 0, 0, 0 }
	};
	struct mntent *mnt_info;
	char tmp[PATH_MAX];
	char *dbpath;
	int c, found;
	char *p1;
	FILE *fp;

	if (argc < 6 ) {
		return(-EINVAL);
	}

	while ((c = getopt_long(argc, argv, "-cdfhlm:no:t:vy",
				long_options, &option_index)) != -1) {
		switch (c) {
		case 'c':
			lfsck_create++;
			break;
		case 'd':
			lfsck_delete++;
			break;
		case 'f':
			lfsck_force++;
			break;
		case 'h':
			lfsck_help++;
			break;
		case 'l':
			lfsck_save++;
			break;
		case 'm':
			VERBOSE(1, "MDSDB: %s\n", optarg);
			dbpath = malloc(PATH_MAX);
			if (dbpath == NULL) {
				fprintf(stderr, "error allocating dbpath\n");
				return -ENOMEM;
			}
			strcpy(tmp, optarg);
			if (realpath(my_dirname(tmp), dbpath) == NULL) {
				fprintf(stderr, "Failure to resolve path %s\n",
					optarg);
				free(dbpath);
				exit(1);
			}

			strcpy(tmp, optarg);
			sprintf(dbpath+strlen(dbpath), "/%s", my_basename(tmp));
			mds_file = dbpath;
			break;
		case 'n':
			lfsck_create = 0;
			lfsck_delete = 0;
			lfsck_save = 0;
			break;
		case 1:
			if (optind == argc) { /* last one is mountpoint */
				VERBOSE(1, "MOUNTPOINT: %s\n", optarg);
				path_name = optarg;
				break;
			}
			/* Otherwise it is another ostdb file */
		case 'o':
		{
			char *ost_path;

			VERBOSE(1, "OSTDB[%u]: %s\n", num_ost_files, optarg);
			p1 = optarg;
			do {
				dbpath = malloc(PATH_MAX);
				if (dbpath == NULL) {
					fprintf(stderr,
					       "error allocate ost_files[%d]\n",
					       num_ost_files);
					return -ENOMEM;
				}

				/* Old-style arguments are comma separated */
				ost_path = strsep(&p1, ",");
				strcpy(tmp, ost_path);
				if (realpath(my_dirname(tmp), dbpath) == NULL) {
					fprintf(stderr, "Failure to resolve "
						"path %s\n", optarg);
					for (c = 0; c < num_ost_files; c++)
						free(ost_files[c]);
					free(dbpath);
					exit(1);
				}

				strcpy(tmp, ost_path);
				sprintf(dbpath+strlen(dbpath), "/%s",
					my_basename(tmp));
				ost_files[num_ost_files] = dbpath;
				num_ost_files++;
			} while (p1 != NULL);
			break;
		}
		case 't':
			num_threads = strtol(optarg, NULL, 0);
			if (num_threads == ULONG_MAX)
				return (-EINVAL);

			if (num_threads > 128)
				num_threads = 128;
			if (num_threads == 0)
				num_threads = 1;
			break;
		case 'v':
			lfsck_verbose++;
			break;
		case 'y':
			lfsck_yes++;
			break;
		default:
			fprintf(stderr, "unknown option %c\n", c);
			return(-EINVAL);
		}
	}

	if (path_name == NULL)
		path_name = argv[optind];

	if (lfsck_yes && !lfsck_save && !lfsck_delete && !lfsck_create) {
		fprintf(stderr, "--yes requires either --save or --delete, or"
			"--create\n");
		return(-EINVAL);
	}

	if (realpath(path_name, mnt_path) == NULL) {
		fprintf(stderr, "error getting real mount path_name\n");
		return (-EINVAL);
	}
	fp = setmntent(MOUNTED, "r");
	if (fp == NULL) {
		fprintf(stderr, "error opening /etc/mtab\n");
		return (-EINVAL);
	}

	found = 0;
	while ((mnt_info = getmntent(fp)) != NULL) {
		if (strcmp(mnt_info->mnt_dir, mnt_path) == 0 &&
		    llapi_is_lustre_mnttype(mnt_info->mnt_type)) {
			found = 1;
			break;
		}
	}
	endmntent(fp);

	if (!found) {
		if (lfsck_force) {
			fprintf(stderr, "lfsck: %s unmounted?  forcing\n",
				mnt_path);
		} else {
			fprintf(stderr, "lfsck: %s not mounted\n", mnt_path);
			return(-EINVAL);
		}
	}

	if (mds_file == NULL || ost_files[0] == NULL) {
		fprintf(stderr, "--mdsdb or --ostdb unspecified\n");
		return(-EINVAL);
	}

	return(0);
}

/*
 * This is called from lfsck_get_path and also recursively.
 * This function is used on error paths when the name of an mds fid has
 * to be determined.  It looks the fid up in the mds_fid -> containing
 * directory table and then recurses on the containing directory until an
 * entry whose parent is the root is reached.  At that point the fid array
 * is allocated (depth + 1 entries) and, as the recursion unwinds, each
 * level stores its own fid at index "depth".
 * For a file like <mntpt>/aaa/ccc/ddd the fids of aaa and ccc and the fid
 * of ddd itself are returned, with the sought fid at index 0.
 *
 * On success returns 0; lfidp->fids is then owned by the caller and must
 * be free()d.  Returns -ENOENT if a fid is not in the table and -ENOMEM
 * if the array cannot be allocated.
 */
static int lfsck_get_fids(struct lu_fid *mds_fid, DB *mds_direntdb,
			  int depth, struct lfsck_fids *lfidp)
{
	struct lfsck_mds_dirent mds_dirent;
	int rc = 0;
	DBT key, data;

	memset(&mds_dirent, 0, sizeof(mds_dirent));
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	key.data = mds_fid;
	key.size = sizeof(*mds_fid);
	data.data = &mds_dirent;
	data.size = data.ulen = sizeof(mds_dirent);
	data.flags = DB_DBT_USERMEM;

	rc = mds_direntdb->get(mds_direntdb, NULL, &key, &data, 0);
	if (rc != 0) {
		if (!lfsck_is_dirfid_root(mds_fid)) {
			log_write("Failed to find fid "DFID": %s\n",
				  PFID(mds_fid), db_strerror(rc));
			return -ENOENT;
		}
		/* A failed lookup is tolerated for the root fid, but no
		 * record was read, so mds_dirent must not be used.  Treat
		 * the root itself as the end of the chain.
		 * NOTE(review): assumes "root is simply absent from the
		 * table" is what the tolerance is for - confirm. */
		lfidp->fids = malloc(sizeof(*lfidp->fids) * (depth + 1));
		if (lfidp->fids == NULL)
			return -ENOMEM;
		lfidp->depth = depth;
		lfidp->fids[depth] = *mds_fid;
		return 0;
	}

	letocpu_mds_dirent(&mds_dirent);
	if (lfsck_is_dirfid_root(&mds_dirent.mds_dirfid)) {
		/* top of the chain: now the total depth is known */
		lfidp->fids = malloc(sizeof(*lfidp->fids) * (depth + 1));
		if (lfidp->fids == NULL)
			return -ENOMEM;
		lfidp->depth = depth;
		lfidp->fids[depth] = mds_dirent.mds_fid;
		return 0;
	}

	rc = lfsck_get_fids(&mds_dirent.mds_dirfid, mds_direntdb,
			    depth + 1, lfidp);
	if (rc)
		return rc;

	/* the array was allocated by the deepest call */
	lfidp->fids[depth] = mds_dirent.mds_fid;
	return 0;
}

/* This function determines a path to a file given an mds fid.
 * The workhorse function is lfsck_get_fids which, once given a
 * fid, returns a list of directory fids from the "root" directory to
 * the fid in question. Using these fids we can construct the path to
 * the file by using readdir(): at each level the directory is scanned
 * for the entry whose fid (via lfsck_path2fid) matches the next fid in
 * the list.
 *
 * path must have room for path_len bytes including the terminating NUL.
 * Returns 0 with the path in "path", or a negative errno:
 * -ENOENT (fid or directory entry not found), -ENOMEM (path does not fit
 * in path_len), -ENAMETOOLONG (an entry does not fit in PATH_MAX), or the
 * opendir() error.
 */
static int lfsck_get_path(struct lu_fid *mds_fid, DB *mds_direntdb,
			  char *path, int path_len)
{
	struct lfsck_fids lfids;
	struct dirent *dent;
	size_t mnt_len, cur_len;
	DIR *dir;
	int rc, i;

	VERBOSE(2, "lookup path for FID "DFID"\n", PFID(mds_fid));

	lfids.fids = NULL;
	lfids.depth = 0;

	rc = lfsck_get_fids(mds_fid, mds_direntdb, 0, &lfids);
	if (rc != 0) {
		rc = -ENOENT;
		goto out;
	}

	mnt_len = strlen(mnt_path);
	if (path_len <= 0 || mnt_len + 1 > (size_t)path_len) {
		rc = -ENOMEM;
		goto out;
	}
	memcpy(path, mnt_path, mnt_len + 1);
	cur_len = mnt_len;

	/* Skip the first dir since this would be "ROOT" */
	rc = 0;
	for (i = lfids.depth - 1; i >= 0; i--) {
		dir = opendir(path);
		if (dir == NULL) {
			rc = -errno;
			goto out;
		}
		while (1) {
			char path_tmp[PATH_MAX];
			struct lu_fid fid;
			size_t name_len;

			dent = readdir(dir);
			if (dent == NULL) {
				closedir(dir);
				rc = -ENOENT;
				goto out;
			}

			name_len = strlen(dent->d_name);
			if (cur_len + name_len + 2 >= PATH_MAX) {
				closedir(dir);
				rc = -ENAMETOOLONG;
				goto out;
			}

			snprintf(path_tmp, sizeof(path_tmp), "%s/%s",
				 path, dent->d_name);
			rc = lfsck_path2fid(path_tmp, &fid);
			if (rc)
				continue;

			if (lfsck_fidcmp(&fid, &lfids.fids[i]) != 0)
				continue;

			/* need room for '/', the name and the NUL; the
			 * original test was one byte short and could write
			 * the NUL at path[path_len] */
			if (cur_len + 1 + name_len + 1 > (size_t)path_len) {
				rc = -ENOMEM;
				closedir(dir);
				goto out;
			}
			path[cur_len++] = '/';
			memcpy(&path[cur_len], dent->d_name, name_len + 1);
			cur_len += name_len;
			closedir(dir);
			break;
		}
	}
out:
	free(lfids.fids);
	return rc;
}

/*
 * Used by pass1 to save the ids of files which reference the same
 * objects. This is then used by pass4 to repair these files.
 *
 * The entry is appended to the global lfsck_duplicates array, which grows
 * in steps of RLIMIT entries; the update is done under phase_lock.
 * Returns 0 on success or -ENOMEM if the array cannot be (re)allocated.
 */
static int lfsck_save_duplicate(const struct lfsck_mds_objent *mds_obj)
{
	int rc = 0;

	VERBOSE(2, "save duplicate object %u:"DOIF" FID "DFID"\n",
		mds_obj->mds_ostidx, POIF(&mds_obj->mds_oi),
		PFID(&mds_obj->mds_fid));

	pthread_mutex_lock(&phase_lock);
	if (lfsck_duplicates == NULL) {
		lfsck_duplicates = malloc(sizeof(*lfsck_duplicates) *
					  RLIMIT);
		if (lfsck_duplicates == NULL) {
			rc = -ENOMEM;
			goto out_unlock;
		}
	} else if (!((lfsck_dup_saved + 1) % RLIMIT)) {
		/* the last slot of the current chunk is about to be used:
		 * add another RLIMIT entries */
		size_t size = (((lfsck_dup_saved + 1) / RLIMIT) + 1) *
			      sizeof(*lfsck_duplicates) * RLIMIT;
		void *tmp = realloc(lfsck_duplicates, size);

		if (tmp == NULL) {
			rc = -ENOMEM;
			goto out_unlock;
		}

		lfsck_duplicates = tmp;
	}
	lfsck_duplicates[lfsck_dup_saved].ld_mds_fid = mds_obj->mds_fid;
	lfsck_duplicates[lfsck_dup_saved].ld_oi = mds_obj->mds_oi;
	lfsck_duplicates[lfsck_dup_saved].ld_ost_idx = mds_obj->mds_ostidx;
	lfsck_duplicates[lfsck_dup_saved].ld_link = NULL;
	lfsck_dup_saved++;

out_unlock:
	/* every exit path must drop the lock, including the failures */
	pthread_mutex_unlock(&phase_lock);
	return rc;
}

/*
 * Check for duplicate ost objects on mds. Run through the table of
 * mds_fid/ost object to make sure that each ost object is only
 * referenced by one mds entry. If a duplicate is found save the information
 * for repair in pass4.
 *
 * ost_db and mds_direntdb are not used by this pass.
 * Returns 0 on success, the Berkeley DB error if the cursor cannot be
 * created, or -EINVAL on any other database error.
 */
int lfsck_run_pass1(__u32 ost_idx, DB *mds_ostdb, DB *ost_db, DB *mds_direntdb)
{
	struct lfsck_mds_objent mds_obj, mds_obj2;
	unsigned long count = 0;
	int error = 0;	/* number of duplicate entries recorded */
	int rc = 0;
	DBT key, data;
	DBC *dbcp = NULL;

	log_write("%s: ost_idx %d: pass1: check for duplicate objects\n",
		  progname, ost_idx);

	if ((rc = mds_ostdb->cursor(mds_ostdb, NULL, &dbcp, 0)) != 0) {
		log_write("%s: error acquiring cursor for database: %s\n",
			  progname, db_strerror(rc));
		goto out;
	}
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	data.data = &mds_obj;
	data.size = data.ulen = sizeof(mds_obj);
	data.flags = DB_DBT_USERMEM;

	while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0)  {
		DBT data_dup;
		db_recno_t num_dup;
		db_recno_t i;

		count++;
		/* number of records stored under the current key */
		if ((rc = dbcp->c_count(dbcp, &num_dup, 0)) != 0) {
			log_write("%s: [%u] getting object refcount: %s\n",
				  progname, ost_idx, db_strerror(rc));
			rc = -EINVAL;
			goto out;
		}
		if (num_dup <= 1)
			continue;

		letocpu_mds_objent(&mds_obj);

		error++;
		if (lfsck_save_duplicate(&mds_obj))
			fix_failed++;

		/* walk the remaining records that share this key */
		for (i = 1; i < num_dup; i++) {
			memset(&data_dup, 0, sizeof(data_dup));
			data_dup.data = &mds_obj2;
			data_dup.size = data_dup.ulen = sizeof(mds_obj2);
			data_dup.flags = DB_DBT_USERMEM;
			rc = dbcp->c_get(dbcp, &key, &data_dup, DB_NEXT);
			if (rc != 0) {
				log_write("%s: acquiring duplicate info: %s\n",
					  progname, db_strerror(rc));
				rc = -EINVAL;
				goto out;
			}
			letocpu_mds_objent(&mds_obj2);

			/* same FID referencing the object again: hard link */
			if (!lfsck_fidcmp(&mds_obj.mds_fid,&mds_obj2.mds_fid)) {
				log_write("%s: [%u] hard link on FID "DFID" is"
					  " not a duplicate object "DOIF"\n",
					  progname, ost_idx,
					  PFID(&mds_obj.mds_fid),
					  POIF(&mds_obj.mds_oi));
				continue;
			}

			error++;
			if (lfsck_save_duplicate(&mds_obj2))
				fix_failed++;
		}
	}

	if (rc != DB_NOTFOUND) {
		log_write("%s: error reading from inode database: %s\n",
			  progname, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}

	rc = 0;
	if (error == 0) {
		log_write("%s: ost_idx %d: pass1 OK (%lu files total)\n",
			  progname, ost_idx, count);
	} else {
		log_write("%s: ost_idx %d: pass1 ERROR: %d duplicate "
			  "entries found (fixed in pass4) (%lu files total)\n",
			  progname, ost_idx, error, count);
	}
out:
	if (dbcp)
		dbcp->c_close(dbcp);

	return rc;
}

/*
 * Fallback definitions of the object-recreate ioctls used by
 * lfsck_recreate_obj() for builds whose headers do not provide them.
 * Both use ioctl number 157 and differ only in the argument type.
 */
#ifndef LL_IOC_RECREATE_OBJ
#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long)          /* 1.x object IDs */
#endif
#ifndef LL_IOC_RECREATE_FID
#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid) /* 2.x FIDs */
#endif

/*
 * If an MDS file is missing an object recreate object using an ioctl call.
 *
 * cmd is LL_IOC_RECREATE_FID or LL_IOC_RECREATE_OBJ and creat points to
 * the matching request structure built by the caller; oi and ost_idx are
 * only used for the log messages.  When --create was not given the
 * missing object is just reported.
 *
 * Returns 0 on success (or when nothing was attempted) and a negative
 * errno on failure; the fixed / fix_failed counters are updated.
 */
static int lfsck_recreate_obj(int cmd, void *creat, struct ost_id *oi,
			      __u32 ost_idx, char *path)
{
	int fd;
	int rc;

	if (!lfsck_create) {
		log_write("[%u]: %s object %s "DOIF" not created\n", ost_idx,
			  path,cmd == LL_IOC_RECREATE_FID? "FID":"ID",POIF(oi));
		return 0;
	}

	fd = open(path, O_LARGEFILE | O_RDONLY, 0);
	if (fd < 0) {
		rc = -errno;
		log_write("[%u]: FAILED to open %s missing obj "DOIF"\n",
			  ost_idx, path, POIF(oi));
		fix_failed++;
		return rc;
	}

	/* pass the request structure itself; "&creat" would hand the
	 * kernel the address of our local pointer variable */
	rc = ioctl(fd, cmd, creat);
	if (rc) {
		rc = -errno;
		log_write("[%u]: failed to recreate %s missing obj "DOIF"\n",
			  ost_idx, path, POIF(oi));
		fix_failed++;
	} else {
		log_write("[%u]: recreated %s missing obj "DOIF"\n",
			  ost_idx, path, POIF(oi));
		fixed++;
	}
	close(fd);

	return rc;
}

/*
 * If size checking is enabled (LFSCK_CHECK_SIZE) see if this ost object is
 * "adding" to the file size; if it is then just calculate the new size and
 * save it in the sizeinfo table.  The table update is done under
 * size_lock.
 *
 * Without LFSCK_CHECK_SIZE this is a no-op that returns 0.
 * Returns 0 on success, -ENOENT if the file has no sizeinfo record,
 * -EINVAL if the record has a zero stripe size, -EIO if the record
 * cannot be written back.
 */
static int lfsck_calc_size(struct lfsck_mds_objent *mds_obj,
			   struct lfsck_ost_objent *ost_obj,
			   DB *mds_sizeinfodb)
{
	int rc = 0;
#ifdef LFSCK_CHECK_SIZE
	struct lfsck_mds_szinfo mds_szinfo1;
	__u64 calc_size;
	DBT key, data;
	__u64 chunks, rem;

	if (ost_obj->ost_size == 0)
		return 0;

	pthread_mutex_lock(&size_lock);
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	key.data = &mds_obj->mds_fid;
	key.size = sizeof(mds_obj->mds_fid);
	data.data = &mds_szinfo1;
	data.size = data.ulen = sizeof(mds_szinfo1);
	data.flags = DB_DBT_USERMEM;
	if ((rc = mds_sizeinfodb->get(mds_sizeinfodb, NULL, &key, &data, 0))) {
		log_write("Failure to get sizeinfo "DFID"\n",
			  PFID(&mds_obj->mds_fid));
		pthread_mutex_unlock(&size_lock);
		return -ENOENT;
	}
	letocpu_mds_szinfo(&mds_szinfo1);
	assert (mds_szinfo1.mds_stripe_pattern == LOV_PATTERN_RAID0);
	if (mds_szinfo1.mds_stripe_size == 0) {
		/* would divide by zero below */
		log_write("Invalid zero stripe size for "DFID"\n",
			  PFID(&mds_obj->mds_fid));
		pthread_mutex_unlock(&size_lock);
		return -EINVAL;
	}
	chunks = ost_obj->ost_size / mds_szinfo1.mds_stripe_size;
	rem = ost_obj->ost_size % mds_szinfo1.mds_stripe_size;
	if (rem == 0) {
		/* object ends exactly on a stripe boundary: the last full
		 * round of stripes ends with this object's stripe */
		calc_size = (((chunks - 1) * mds_szinfo1.mds_stripe_size)
			    * mds_szinfo1.mds_stripe_count);
		calc_size += mds_szinfo1.mds_stripe_size *
			     (mds_obj->mds_ostoffset + 1);
	} else {
		/* full rounds, then the stripes before this object in the
		 * last round, then the partial stripe */
		calc_size = ((chunks * mds_szinfo1.mds_stripe_size)
			    * mds_szinfo1.mds_stripe_count);
		calc_size += mds_szinfo1.mds_stripe_size *
			     mds_obj->mds_ostoffset;
		calc_size += rem;
	}
	if (calc_size > mds_szinfo1.mds_calc_size) {
		mds_szinfo1.mds_calc_size = calc_size;
		memset(&key, 0, sizeof(key));
		memset(&data, 0, sizeof(data));
		key.data = &mds_obj->mds_fid;
		key.size = sizeof(mds_obj->mds_fid);
		data.data = &mds_szinfo1;
		data.size = sizeof(mds_szinfo1);
		cputole_mds_szinfo(&mds_szinfo1);
		/* Make sure we overwrite */
		if ((rc = mds_sizeinfodb->put(mds_sizeinfodb,
					      NULL, &key, &data, 0)) != 0) {
			log_write("Failure to update sizeinfo data\n");
			pthread_mutex_unlock(&size_lock);
			return -EIO;
		}
	}
	pthread_mutex_unlock(&size_lock);
#endif /* LFSCK_CHECK_SIZE */
	return rc;
}

/*
 * Check for dangling inode.
 * This pass runs through the mds table for an ost and checks against the
 * ost table that each object referenced on the mds exists on the ost.
 * A missing object is reported and, with --create, recreated through
 * lfsck_recreate_obj().  For objects that do exist the file size
 * information is updated via lfsck_calc_size().
 *
 * Returns -ENOMEM if the path buffer cannot be allocated and 0 otherwise.
 * NOTE(review): database errors are logged but, as in the original code,
 * not propagated to the caller (the function returns 0) - confirm that
 * this is intended.
 */
int lfsck_run_pass2(__u32 ost_idx, struct lfsck_mds_hdr *mds_hdr,
		    DB *mds_ostdb, DB *ostdb,
		    DB *mds_direntdb, DB *mds_sizeinfodb)
{
	struct lfsck_mds_objent mds_obj;
	struct lfsck_ost_objent ost_obj;
	int error = 0, rc = 0;
	unsigned long count = 0;
	char *path;
	DBC *dbcp = NULL;
	DBT key, data;
	__u64 max_objid = mds_hdr->mds_max_ost_id[ost_idx];
	__u64 mds_connect_flags = 0;

	rc = llapi_get_connect_flags(mnt_path, &mds_connect_flags);
	/* Ignore the error here, and assume it is an older 1.8.x without
	 * LL_IOC_GET_CONNECT_FLAGS.  We only use this for 2.x detection. */

	log_write("lfsck: ost_idx %d: pass2: check for missing inode objects\n",
		  ost_idx);

	path = malloc(PATH_MAX);
	if (path == NULL) {
		log_write("lfsck: [%u]: pass2 ERROR: out of memory\n",
			   ost_idx);
		return -ENOMEM;
	}

	rc = mds_ostdb->cursor(mds_ostdb, NULL, &dbcp, 0);
	if (rc != 0) {
		log_write("[%u]: error acquiring cursor for mds table: %s\n",
			  ost_idx, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	data.data = &mds_obj;
	data.size = data.ulen = sizeof(mds_obj);
	data.flags = DB_DBT_USERMEM;
	while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
		DBT key_ost, data_ost;

		count++;
		letocpu_mds_objent(&mds_obj);

		if (mds_hdr->mds_flags & E2F_OPT_READONLY &&
		    mds_obj.mds_oi.oi_id > max_objid) {
			VERBOSE(2, "[%u] skipping MDS FID "DFID": object "DOIF
				" > max "LPU64"\n", ost_idx,
				PFID(&mds_obj.mds_fid), POIF(&mds_obj.mds_oi),
				max_objid);
			continue;
		}

		memset(&key_ost, 0, sizeof(key_ost));
		memset(&data_ost, 0, sizeof(data_ost));
		key_ost.data = &mds_obj.mds_oi;
		key_ost.size = sizeof(mds_obj.mds_oi);
		data_ost.data = &ost_obj;
		data_ost.size = data_ost.ulen = sizeof(ost_obj);
		data_ost.flags = DB_DBT_USERMEM;
		rc = ostdb->get(ostdb, NULL, &key_ost, &data_ost, 0);
		if (rc == DB_NOTFOUND) {
			struct lu_fid fid;
			struct create18 {
				__u64 lrc_id;
				__u32 lrc_ost_idx;
			} create18;
			void *create;
			int cmd;

			if (lfsck_get_path(&mds_obj.mds_fid,
					   mds_direntdb, path, PATH_MAX)) {
				VERBOSE(1,"[%u]: MDS FID "DFID" object "
					DOIF" deleted?\n", ost_idx,
					PFID(&mds_obj.mds_fid),
					POIF(&mds_obj.mds_oi));
				continue;
			}
			error++;
			/* 2.x servers take a FID, 1.x an id + ost index */
			if (mds_connect_flags & OBD_CONNECT_FID) {
				ostid_idif_unpack(&mds_obj.mds_oi,
						  &fid, ost_idx);
				create = &fid;
				cmd = LL_IOC_RECREATE_FID;
			} else {
				create18.lrc_id = mds_obj.mds_oi.oi_id;
				create18.lrc_ost_idx = ost_idx;
				create = &create18;
				cmd = LL_IOC_RECREATE_OBJ;
			}

			lfsck_recreate_obj(cmd, create, &mds_obj.mds_oi,
					   ost_idx, path);
			/* no record was read into ost_obj, so there is no
			 * size information to account for */
			continue;
		}
		if (rc != 0) {
			log_write("[%u]: error looking up object "DOIF
				  ": %s\n", ost_idx,
				  POIF(&mds_obj.mds_oi),
				  db_strerror(rc));
			rc = -EINVAL;
			goto out;
		}

		/* convert the record that was actually read */
		letocpu_ost_objent(&ost_obj);

		rc = lfsck_calc_size(&mds_obj, &ost_obj, mds_sizeinfodb);
		if (rc) {
			log_write("[%u]: error updating file size for object "
				  DOIF": %s\n", ost_idx, POIF(&mds_obj.mds_oi),
				  strerror(-rc));
			rc = -EINVAL;
			goto out;
		}
	}
	if (rc != DB_NOTFOUND) {
		log_write("[%u]: error getting next inode: %s\n",
			  ost_idx, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}
	rc = 0;
	if (error == 0) {
		log_write("lfsck: ost_idx %d: pass2 OK (%lu objects)\n",
			  ost_idx, count);
	} else {
		log_write("lfsck: ost_idx %d: pass2 ERROR: %d dangling inodes "
			  "found (%lu files total)\n", ost_idx, error, count);
	}

out:
	/* the cursor is still NULL when its creation failed */
	if (dbcp)
		dbcp->c_close(dbcp);
	free(path);
	return(0);
}

/*
 * If an object exists on an ost but is not referenced by an entry on the mds
 * then create a lost+found entry and set the EA on the file so that the
 * orphaned object is picked up. If the object is requested to be deleted,
 * an unlink of this lost+found file will then delete the object as well.
 *
 * The file is created as <lostandfounddir>/<uuid>-<object id>.  It is
 * unlinked again when "delete" is set or when setting the EA failed.
 * Returns 0 on success or a negative errno.
 */
int lfsck_fix_orphan(__u32 ost_idx, struct ost_id *oi,
		     struct obd_uuid *uuid, int delete)
{
	struct lov_user_md *lum;
	char file[PATH_MAX];
	int fd, lum_size, n, rc = 0;
	struct utimbuf utimbuf = { 0, 0 };

	lum_size = LOV_EA_SIZE(lum, 1);

	/* zero-filled so that fields not set below are not handed to the
	 * kernel uninitialized */
	lum = calloc(1, lum_size);
	if (lum == NULL) {
		log_write("%s: out of memory on EA (%u) orphan %u:"DOIF"\n",
			  progname, lum_size, ost_idx, POIF(oi));
		return -ENOMEM;
	}

	n = snprintf(file, sizeof(file), "%s/%s-"DOIF, lostandfounddir,
		     uuid->uuid, POIF(oi));
	if (n < 0 || (size_t)n >= sizeof(file)) {
		rc = -ENAMETOOLONG;
		log_write("%s: path too long for orphan %u:"DOIF"\n",
			  progname, ost_idx, POIF(oi));
		goto out_free;
	}

	fd = open(file, O_CREAT|O_EXCL|O_LOV_DELAY_CREATE, 0600);
	if (fd < 0) {
		rc = -errno;
		log_write("%s: unable to open %s for orphan %u:"DOIF": %s\n",
			  progname, file, ost_idx, POIF(oi), strerror(-rc));
		goto out_free;
	}
	/* single-stripe layout pointing at the orphan object */
	lum->lmm_magic = LOV_USER_MAGIC;
	lum->lmm_pattern = LOV_PATTERN_RAID0;
	lum->lmm_stripe_size = 1048576;
	lum->lmm_stripe_offset = 0;
	lum->lmm_stripe_count = 1;
	lum->lmm_objects[0].l_object_id = oi->oi_id;
	lum->lmm_objects[0].l_object_seq = oi->oi_seq;
	lum->lmm_objects[0].l_ost_gen = 0;
	lum->lmm_objects[0].l_ost_idx = ost_idx;

	/* reset the MDS timestamps so we can see the OST timestamps */
	utime(file, &utimbuf);

	if (ioctl(fd, LL_IOC_LOV_SETEA, lum) < 0) {
		rc = -errno;
		log_write("%s: unable to set EA on %s for orphan %u:"DOIF
			  ": %s\n", progname, file, ost_idx, POIF(oi),
			  strerror(-rc));
	}

	close(fd);
	if (rc != 0 || delete) {
		if (unlink(file) != 0 && errno != ENOENT) {
			int err = errno;

			/* report the unlink error itself, but keep an
			 * earlier failure as the return value */
			log_write("%s: failed to unlink %s for orphan %u:"DOIF
				  ": %s\n", progname, file, ost_idx,
				  POIF(oi), strerror(err));
			if (rc == 0)
				rc = -err;
		}
	}
out_free:
	free(lum);
	return rc;
}

/*
 * Check for orphans
 * Run through each entry in ost table and check the mds ost table for
 * a corresponding entry. If not found report and repair.
 *
 * Objects with an id above last_id are skipped.  Zero-length orphans are
 * not counted as errors; with --lostfound or --delete they are simply
 * removed.  Other orphans are saved or unlinked through
 * lfsck_fix_orphan() when requested, and only reported otherwise.
 *
 * NOTE(review): like the original code this always returns 0, even after
 * a database error has been logged - confirm that this is intended.
 */
int lfsck_run_pass3(__u32 ost_idx, DB *mds_ostdb, DB *ostdb,
		    struct obd_uuid *uuid, __u64 last_id)
{
	int error = 0, rc = 0;
	struct lfsck_mds_objent mds_obj;
	struct lfsck_ost_objent ost_obj;
	unsigned long count = 0;
	DBT key, data;
	DBC *dbcp = NULL;
	__u64 bytes = 0;

	log_write("lfsck: ost_idx %d: pass3: check for orphan objects\n",
		  ost_idx);

	VERBOSE(1, "[%u] uuid %s\n", ost_idx, uuid->uuid);
	VERBOSE(1, "[%u] last_id "LPU64"\n", ost_idx, last_id);

	rc = ostdb->cursor(ostdb, NULL, &dbcp, 0);
	if (rc != 0) {
		log_write("[%u]: error acquiring cursor for ost table: %s\n",
			  ost_idx, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	data.data = &ost_obj;
	data.size = data.ulen = sizeof(ost_obj);
	data.flags = DB_DBT_USERMEM;

	while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
		DBT key_mdt, data_mdt;
		struct ost_id *oi;

		count++;
		letocpu_ost_objent(&ost_obj);
		oi = &ost_obj.ost_oi;

		if (oi->oi_id > last_id) {
			VERBOSE(2, "[%u] skipping objid "DOIF" > "LPU64"\n",
				ost_idx, POIF(oi), last_id);
			continue;
		}
		VERBOSE(2, "[%u] processing objid "DOIF"\n", ost_idx, POIF(oi));

		memset(&key_mdt, 0, sizeof(key_mdt));
		memset(&data_mdt, 0, sizeof(data_mdt));
		key_mdt.data = oi;
		key_mdt.size = sizeof(*oi);
		data_mdt.data = &mds_obj;
		data_mdt.size = data_mdt.ulen = sizeof(mds_obj);
		data_mdt.flags = DB_DBT_USERMEM;
		rc = mds_ostdb->get(mds_ostdb, NULL, &key_mdt, &data_mdt, 0);
		if (rc == 0) {
			VERBOSE(2, "[%u] found object "DOIF" reference\n",
				ost_idx, POIF(oi));
			continue;
		}

		/* mds_obj holds no record here, so it is left untouched */
		if (rc != DB_NOTFOUND) {
			log_write("Failed to check mds db for entry\n");
			rc = -EINVAL;
			goto out;
		}
		if (ost_obj.ost_size == 0) {
			/* don't report errors for normal orphan recovery */
			VERBOSE(1, "[%u] zero-length orphan objid "DOIF"\n",
				ost_idx, POIF(oi));
			if (lfsck_save || lfsck_delete) {
				/* No reason to save just delete*/
				rc = lfsck_fix_orphan(ost_idx, oi, uuid, 1);
				if (rc) {
					log_write("lfsck: [%u]: pass3 "
						  "error fixing zero-length "
						  "orphan objid "DOIF"\n",
						  ost_idx, POIF(oi));
					fix_failed++;
				} else {
					fixed++;
				}
			}
			continue;
		}

		/* count each orphan exactly once */
		error++;
		bytes += ost_obj.ost_bytes;
		if (lfsck_save || lfsck_delete) {
			rc = lfsck_fix_orphan(ost_idx, oi, uuid, lfsck_delete);
			if (rc) {
				log_write("lfsck: [%u]: failed to fix orphan "
					  "object "DOIF", "LPU64" bytes\n",
					  ost_idx, POIF(oi),ost_obj.ost_bytes);
				fix_failed++;
			} else {
				/* lfsck_delete decides whether the file was
				 * unlinked, so it decides the wording too */
				log_write("lfsck: [%u]: pass3 %s orphan object "
					  DOIF", "LPU64" bytes\n", ost_idx,
					  lfsck_delete ? "unlinked" : "saved",
					  POIF(oi), ost_obj.ost_bytes);
				fixed++;
			}
		} else {
			log_write("lfsck: [%u]: pass3 orphan found objid "
				  DOIF", "LPU64" bytes\n", ost_idx,
				  POIF(oi), ost_obj.ost_bytes);
		}
	}
	if (rc != DB_NOTFOUND) {
		log_write("[%u]: error getting next object in db %s\n",
			  ost_idx, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}

	if (error == 0) {
		log_write("lfsck: ost_idx %d: pass3 OK (%lu files total)\n",
			  ost_idx, count);
	} else {
		log_write("lfsck: ost_idx %d: pass3 %s: %4gMB of orphan "
			  "data (%d of %lu files total)\n", ost_idx,
			  (lfsck_save | lfsck_delete) ? "FIXED" : "ERROR",
			  (double)bytes / (1024 * 1024), error, count);
	}
out:
	if (dbcp)
		dbcp->c_close(dbcp);
	return (0);
}

/*
 * Missing ost information: report affected file names.
 *
 * Opens the MDS table for ost_idx inside mds_file and logs the path of
 * every file that has an object on that OST.  A file whose path cannot be
 * determined is logged by FID and counted in fix_failed.  mds_hdr is not
 * used.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -EINVAL on a database error.
 */
int lfsck_list_affected_files(char *mds_file, struct lfsck_mds_hdr *mds_hdr,
			      DB *mds_direntdb, __u32 ost_idx)
{
	struct lfsck_mds_objent mds_obj;
	char dbname[256];
	char *path;
	DB *mds_db = NULL;
	DBT key,data;
	DBC *dbcp = NULL;
	int rc = 0;

	path = malloc(PATH_MAX);
	if (path == NULL)
		return -ENOMEM;

	snprintf(dbname, sizeof(dbname), "%s.%d", MDS_OSTDB, ost_idx);
	if ((rc = lfsck_opendb(mds_file, dbname, &mds_db, 1, 0, 0)) != 0) {
		log_write("%s: failed to open mds db file %s: rc %d\n",
			  progname, mds_file, rc);
		rc = -EINVAL;
		goto out;
	}

	if ((rc = mds_db->cursor(mds_db, NULL, &dbcp, 0)) != 0) {
		log_write("Failed to acquire cursor for mds table\n");
		rc = -EINVAL;
		goto out;
	}

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	data.data = &mds_obj;
	data.size = data.ulen = sizeof(mds_obj);
	data.flags = DB_DBT_USERMEM;

	log_write("Files affected by missing ost info are : -\n");
	while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
		letocpu_mds_objent(&mds_obj);

		if (lfsck_get_path(&mds_obj.mds_fid, mds_direntdb,
				   path, PATH_MAX)) {
			log_write("Failed to get path for fid "DFID"\n",
				  PFID(&mds_obj.mds_fid));
			fix_failed++;
		} else {
			log_write("%s\n",path);
		}
	}
	if (rc != DB_NOTFOUND) {
		log_write("Error getting next element in db %d\n", rc);
		rc = -EINVAL;
		goto out;
	}
	rc = 0;

out:
	if (dbcp)
		dbcp->c_close(dbcp);
	if (mds_db)
		mds_db->close(mds_db, 0);
	/* the path buffer was never released on any exit path before */
	free(path);
	return rc;
}

/*
 * For each ost index run checks 1, 2 and 3.
 * 1) Check for object referenced by more than one file
 * 2) Check that objects exist on ost
 * 3) Check that containing mds entry exists for an object
 *
 * The OST data file is located by scanning ost_files[] for a header whose
 * UUID equals lfsck_uuid[ost_idx] and whose index equals ost_idx.  When no
 * such file exists the affected file names are listed instead and the
 * result of that listing is returned.
 *
 * Returns 0 on success or the first non-zero error encountered.
 */
int run_test(__u32 ost_idx, struct lfsck_mds_hdr *mds_hdr,
	     DB *mds_direntdb, DB *mds_sizeinfodb )
{
	struct lfsck_ost_hdr *hdr = NULL;
	DB *mdsdb = NULL;
	DB *ostdb = NULL;
	char table[256];
	DBT key, data;
	__u64 last_id;
	int n, rc;

	sprintf(table, "%s.%d", MDS_OSTDB, ost_idx);

	VERBOSE(2, "testing ost_idx %d\n", ost_idx);

	rc = lfsck_opendb(mds_file, table, &mdsdb, 1, 0, 0);
	if (rc != 0) {
		log_write("%s: failed to open mds db file %s: rc %d\n",
			  progname, mds_file, rc);
		goto out;
	}

	hdr = malloc(sizeof(*hdr));
	if (hdr == NULL) {
		log_write("Failure to alloc memory\n");
		rc = -ENOMEM;
		goto out;
	}

	VERBOSE(2, "looking for index %u UUID %s\n", ost_idx,
		lfsck_uuid[ost_idx].uuid);

	/* scan the OST data files for the one describing this target */
	for (n = 0; n < num_ost_files; n++) {
		VERBOSE(2, "checking file %s\n", ost_files[n]);
		rc = lfsck_opendb(ost_files[n], OST_HDR, &ostdb, 0, 0, 0);
		if (rc != 0) {
			log_write("%s: error opening ost_data_file %s: rc %d\n",
				  progname, ost_files[n], rc);
			goto out;
		}

		/* the header record is keyed by the magic value */
		hdr->ost_magic = OST_MAGIC;
		memset(&key, 0, sizeof(key));
		key.data = &hdr->ost_magic;
		key.size = sizeof(hdr->ost_magic);
		memset(&data, 0, sizeof(data));
		data.data = hdr;
		data.size = data.ulen = sizeof(*hdr);
		data.flags = DB_DBT_USERMEM;

		rc = ostdb->get(ostdb, NULL, &key, &data, 0);
		/* the header table is not needed once the record is read */
		ostdb->close(ostdb, 0);
		ostdb = NULL;
		if (rc != 0) {
			log_write("Invalid ost magic on file %s: rc %s\n",
				  ost_files[n], db_strerror(rc));
			continue;
		}

		letocpu_ost_hdr(hdr);
		VERBOSE(2, "%s has ost UUID %s\n", ost_files[n],
			hdr->ost_uuid.uuid);

		if (!obd_uuid_equals(&lfsck_uuid[ost_idx], &hdr->ost_uuid))
			continue;

		if (hdr->ost_index == ost_idx)
			break;

		/* right UUID but wrong index: keep looking */
		log_write("Requested ost_idx %u doesn't match "
			  "index %u found in %s\n", ost_idx,
			  hdr->ost_index, ost_files[n]);
	}

	if (n == num_ost_files) {
		log_write("lfsck: can't find file for ost_idx %d\n", ost_idx);
		rc = lfsck_list_affected_files(mds_file, mds_hdr,
					       mds_direntdb, ost_idx);
		goto out;
	}

	rc = lfsck_opendb(ost_files[n], OST_OSTDB, &ostdb, 0, 0, 0);
	if (rc != 0) {
		log_write("%s: error opening ost_data_file %s: rc %d\n",
			  progname, ost_files[n], rc);
		goto out;
	}

	VERBOSE(1, "MDS: max_id "LPU64" OST: max_id "LPU64"\n",
		mds_hdr->mds_max_ost_id[ost_idx], hdr->ost_last_id);

	rc = lfsck_run_pass1(ost_idx, mdsdb, ostdb, mds_direntdb);
	if (rc != 0) {
		log_write("error in running pass1\n");
		goto out;
	}

	rc = lfsck_run_pass2(ost_idx, mds_hdr, mdsdb, ostdb, mds_direntdb,
			     mds_sizeinfodb);
	if (rc != 0) {
		log_write("error in running pass2\n");
		goto out;
	}

	/* if either side carries the read-only flag use the MDS max id,
	 * otherwise the OST's own last id */
	if ((hdr->ost_flags & E2F_OPT_READONLY) ||
	    (mds_hdr->mds_flags & E2F_OPT_READONLY))
		last_id = mds_hdr->mds_max_ost_id[ost_idx];
	else
		last_id = hdr->ost_last_id;

	rc = lfsck_run_pass3(ost_idx, mdsdb, ostdb, &hdr->ost_uuid,
			     last_id);
	if (rc != 0)
		log_write("error in running pass3\n");

out:
	free(hdr);
	if (mdsdb)
		mdsdb->close(mdsdb, 0);
	if (ostdb)
		ostdb->close(ostdb, 0);

	return(rc);
}

static int lfsck_validate_duplicate(struct lfsck_saved_duplicates *dup,
				    const char *path)
{
	struct lov_user_md *lum;
	struct lov_user_ost_data_v1 *loi;
	struct stat64 st;
	struct lu_fid fid;
	int rc, i;

	VERBOSE(2,"[%u] check duplicate FID "DFID" object "DOIF"\n  for\t%s\n",
		dup->ld_ost_idx, PFID(&dup->ld_mds_fid),POIF(&dup->ld_oi),path);

	/* first, validate that the paths are still valid */
	if (stat64(path, &st) < 0) {
		rc = -errno;
		log_write("%s: duplicate file %s error: %s\n",
			  progname, path, strerror(-rc));
		return rc;
	}

	rc = lfsck_path2fid(path, &fid);
	if (rc < 0) {
		log_write("%s: unable to get LMA EA on %s: %s\n",
			  progname, path, strerror(-rc));
		return rc;
	}
	if (lfsck_fidcmp(&dup->ld_mds_fid, &fid)) {
		log_write("%s: duplicate file %s is no longer FID "DFID"\n",
			  progname, path, PFID(&dup->ld_mds_fid));
		return -EBADF;
	}

	lum = malloc(LOV_EA_MAX(lum));
	if (lum == NULL) {
		log_write("%s: out of memory allocating LOV EA (%u)\n",
			  progname, LOV_EA_MAX(lum));
		return -ENOMEM;
	}

	rc = llapi_file_get_stripe(path, lum);
	if (rc < 0) {
		log_write("%s: unable to get LOV EA on %s: %s\n",
			  progname, path, strerror(-rc));
		goto out;
	}

	if (lum->lmm_pattern != LOV_PATTERN_RAID0) {
		log_write("%s: unknown LOV stripe pattern %#08x\n",
			  progname, lum->lmm_pattern);
		rc = -EINVAL;
		goto out;
	}

	if (lum->lmm_magic == LOV_USER_MAGIC_V1) {
		loi = lum->lmm_objects;
	} else if (lum->lmm_magic == LOV_USER_MAGIC_V3) {
		loi = ((struct lov_user_md_v3 *)lum)->lmm_objects;
	} else {
		log_write("%s: unknown LOV magic %#08x\n",
			  progname, lum->lmm_magic);
		rc = -EINVAL;
		goto out;
	}

	/* Verify that the object in question is still in the file */
	for (i = 0; i < lum->lmm_stripe_count; i++, loi++) {
		if (loi->l_ost_idx == dup->ld_ost_idx &&
		    loi->l_object_id == dup->ld_oi.oi_id &&
		    loi->l_object_seq == dup->ld_oi.oi_seq)
			break;
	}

	if (i == lum->lmm_stripe_count) {
		log_write("%s: cannot find object %u:"DOIF" in\n\t%s\n",
			  progname, dup->ld_ost_idx, POIF(&dup->ld_oi), path);
		rc = -EBADF;
		goto out;
	}

out:
	free(lum);
	return rc;
}

#ifndef HAVE_LLAPI_CANCEL_OSC_LOCKS
#define NAMESPACES "/proc/fs/lustre/ldlm/namespaces"
/*
 * Fallback used when the Lustre API does not provide this call: write
 * "clear" to the lru_size file of every entry under NAMESPACES whose name
 * contains "osc" or "OSC".  Entries that cannot be opened are logged and
 * skipped.  The mnt_path argument is not used by this implementation.
 */
void llapi_cancel_osc_locks(const char *mnt_path)
{
	char lru_path[PATH_MAX];
	struct dirent *entry;
	DIR *dir;
	int rc, fd;

	dir = opendir(NAMESPACES);
	if (dir == NULL) {
		rc = -errno;
		log_write("%s: error opening %s: %s\n",
			  progname, NAMESPACES, strerror(-rc));
		return;
	}

	for (entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
		const char *name = entry->d_name;

		if (!strcmp(name, ".") || !strcmp(name, ".."))
			continue;

		/* only OSC namespaces are of interest */
		if (!strstr(name, "osc") && !strstr(name, "OSC"))
			continue;

		snprintf(lru_path, sizeof(lru_path) - 1, "%s/%s/lru_size",
			 NAMESPACES, name);

		fd = open(lru_path, O_WRONLY);
		if (fd < 0) {
			log_write("%s: error opening %s to cancel locks: %s\n",
				  progname, lru_path, strerror(errno));
			continue;
		}

		VERBOSE(3, "clearing locks in %s\n", lru_path);
		/* best effort: the result of the write is not examined */
		rc = write(fd, "clear", 6);
		close(fd);
	}

	closedir(dir);
}
#endif

/* Remove inodes from the client cache to avoid hitting an LASSERTF() on
 * the client if it tries to attach two inodes to the same object.
 *
 * Syncs, cancels the OSC locks for mnt_path and then writes "3" to
 * /proc/sys/vm/drop_caches; nothing is reported if that file cannot be
 * opened or written. */
static void lfsck_drop_caches(void)
{
	int cachefd;
	int rc;

	sync();
	llapi_cancel_osc_locks(mnt_path);

	cachefd = open("/proc/sys/vm/drop_caches", O_WRONLY);
	if (cachefd >= 0) {
		VERBOSE(3, "flushing vm cache\n");
		/* best effort: the result of the write is not examined */
		rc = write(cachefd, "3", 2);
		close(cachefd);
	}
}

/* Duplicate an object that is referenced by multiple files and point one
 * of the files to use the duplicated object.
 *
 * Steps: validate that dup still describes path, copy path to
 * "<path>.lfsck_tmp" with "cp -p", hard-link the original into dupedir
 * (the link name is stored in dup->ld_link so the caller can remove it
 * later), then rename the copy over path.
 *
 * mds_direntdb is accepted for interface compatibility but is not used.
 *
 * Returns 0 on success, or when duplication is disabled (lfsck_create is
 * not set), and a negative errno value on failure.  On failure the
 * temporary copy is removed and dup->ld_link is left NULL if this call
 * had set it. */
static int lfsck_fix_duplicate(struct lfsck_saved_duplicates *dup,
			       DB *mds_direntdb, const char *path)
{
	char path_tmp[PATH_MAX] = { 0 };
	char tmp[PATH_MAX * 2 + 10] = { 0 };
	const char *base;
	int len;
	int rc;

	lfsck_drop_caches();

	if (!lfsck_create) {
		VERBOSE(1, "%s: [%u]: not duplicating FID "DFID
			" object "DOIF" by request\n  on\t%s\n",
			progname, dup->ld_ost_idx, PFID(&dup->ld_mds_fid),
			POIF(&dup->ld_oi), path);
		return 0;
	}

	rc = lfsck_validate_duplicate(dup, path);
	if (rc < 0)
		goto out;

	/* A silently truncated temporary name could refer to an unrelated
	 * file, which would then be overwritten and unlinked below. */
	len = snprintf(path_tmp, sizeof(path_tmp), "%s.lfsck_tmp", path);
	if (len < 0 || (size_t)len >= sizeof(path_tmp)) {
		log_write("%s: temporary name for %s is too long\n",
			  progname, path);
		path_tmp[0] = '\0';
		rc = -ENAMETOOLONG;
		goto out;
	}

	/* NOTE(review): the path is interpolated into a shell command; a
	 * file name containing a single quote breaks the quoting.  Consider
	 * replacing system() with fork()/exec() of cp with an explicit
	 * argument vector. */
	len = snprintf(tmp, sizeof(tmp), "cp -p '%s' '%s'", path, path_tmp);
	if (len < 0 || (size_t)len >= sizeof(tmp)) {
		log_write("%s: copy command for %s is too long\n",
			  progname, path);
		rc = -ENAMETOOLONG;
		goto out;
	}
	VERBOSE(2, "%s\n", tmp);
	rc = system(tmp);
	if (rc) {
		/* system() only reports through errno when it returns -1;
		 * otherwise rc is the exit status of the command and errno
		 * may well be 0, which used to turn this failure into a
		 * success return. */
		rc = (rc == -1 && errno != 0) ? -errno : -EIO;
		log_write("%s: duplicating object for %u:"DOIF" %s: %s\n",
			  progname, dup->ld_ost_idx, POIF(&dup->ld_oi),
			  path, strerror(-rc));
		goto out;
	}

	base = strrchr(path, '/');
	if (base == NULL)
		base = path;
	else
		base++;

	rc = asprintf(&dup->ld_link, "%s/%u-"DOIF"-"DFID":%s", dupedir,
		      dup->ld_ost_idx, POIF(&dup->ld_oi),
		      PFID(&dup->ld_mds_fid), base);
	if (rc < 0) {
		/* the pointer is undefined after a failed asprintf() and
		 * the caller later tests and frees ld_link */
		dup->ld_link = NULL;
		rc = -ENOMEM;
		goto out;
	}

	VERBOSE(2, "ln %s %s\n", path, dup->ld_link);
	rc = link(path, dup->ld_link);
	if (rc) {
		rc = -errno;
		log_write("%s: error linking %s to %s: %s\n",
			  progname, path, dup->ld_link, strerror(-rc));
		free(dup->ld_link);
		dup->ld_link = NULL;
		goto out;
	}

	VERBOSE(2, "rename %s %s\n", path_tmp, path);
	rc = rename(path_tmp, path);
	if (rc) {
		rc = -errno;
		log_write("%s: error renaming %s to %s: %s\n",
			  progname, path_tmp, path, strerror(-rc));
		free(dup->ld_link);
		dup->ld_link = NULL;
	} else {
		log_write("%s: [%u]: fixed duplicate FID "DFID" object "
			  DOIF":\n\t%s\n", progname, dup->ld_ost_idx,
			  PFID(&dup->ld_mds_fid), POIF(&dup->ld_oi), path);
	}
out:
	/* Only clean up when a temporary name was actually chosen; a copy
	 * that was never created (ENOENT) is not worth reporting. */
	if (rc && path_tmp[0] != '\0') {
		VERBOSE(2, "unlink %s\n", path_tmp);
		if (unlink(path_tmp) && errno != ENOENT)
			log_write("%s: unlink %s failed: %s\n", progname,
				  path_tmp, strerror(errno));
	}

	return rc;
}

/*
 * Pass 4: repair the duplicate object references recorded in
 * lfsck_duplicates[] (found in pass1).
 *
 * Entries whose FID object id has been zeroed are treated as already
 * handled.  The repair sweep is repeated for as long as a sweep both failed
 * to resolve some path and fixed at least one entry.  Afterwards every
 * recorded ld_link is unlinked and dupedir is removed with "rm -rvf".
 *
 * Always returns 0; failures are counted in fix_failed.
 */
int lfsck_run_pass4(DB *mds_direntdb)
{
	char cmd[PATH_MAX + 512];
	char path[PATH_MAX];
	int unresolved, progress;
	int status;
	int i, j;

	log_write("lfsck: pass4: check for %u duplicate object references\n",
		  lfsck_dup_saved);
	if (lfsck_dup_saved == 0) {
		log_write("lfsck: pass4 OK (no duplicates)\n");
		return(0);
	}

	do {
		unresolved = 0;
		progress = 0;

		lfsck_drop_caches();

		for (i = 0; i < lfsck_dup_saved; i++) {
			struct lu_fid *fid = &lfsck_duplicates[i].ld_mds_fid;

			/* a zero object id marks an entry as handled */
			if (fid->f_oid == 0)
				continue;

			if (lfsck_get_path(fid, mds_direntdb, path,
					   sizeof(path))) {
				unresolved++;
				continue;
			}

			if (lfsck_fix_duplicate(&lfsck_duplicates[i],
						mds_direntdb, path)) {
				fix_failed++;
				continue;
			}

			fixed++;
			progress++;

			/* don't duplicate a file multiple times even if it has
			 * multiple shared objects */
			for (j = i + 1; j < lfsck_dup_saved; j++) {
				struct lu_fid *other;

				other = &lfsck_duplicates[j].ld_mds_fid;
				if (lfsck_fidcmp(other, fid) == 0)
					other->f_oid = 0;
			}
			fid->f_oid = 0;
		}
	} while (unresolved && progress);

	/* remove the hard links created while fixing */
	for (i = 0; i < lfsck_dup_saved; i++) {
		char *linkname;

		lfsck_drop_caches();

		linkname = lfsck_duplicates[i].ld_link;
		if (!linkname)
			continue;

		if (unlink(linkname))
			log_write("%s: failed to unlink %s: %s\n", progname,
				  linkname, strerror(errno));
		else
			log_write("%s: %s unlinked\n", progname,
				  linkname);
		free(linkname);
		lfsck_duplicates[i].ld_link = NULL;
	}

	snprintf(cmd, sizeof(cmd) - 1, "rm -rvf '%s'", dupedir);
	VERBOSE(1, "%s\n", cmd);
	status = system(cmd);
	if (status == -1) {
		VERBOSE(1, "%s failed", cmd);
	}

	log_write("lfsck: pass4 finished\n");

	return(0);
}

/*
 * This is a placeholder to check for filesize correctness; no fixup is in
 * place right now since file size is still obtained from osts.
 *
 * Unless LFSCK_CHECK_SIZE is defined the function does nothing and returns
 * 0.  With it defined, every record of mds_sizeinfodb whose stored size
 * differs from its calculated size is resolved to a path and compared with
 * the size reported by stat64(); mismatches are logged.
 *
 * Returns 0 on success and -EINVAL on a database error.
 */
int lfsck_run_pass5(DB *mds_direntdb, DB *mds_sizeinfodb)
{
	int rc = 0;
#ifdef LFSCK_CHECK_SIZE
	struct lfsck_mds_szinfo mds_szinfo1;
	char path[PATH_MAX];
	struct stat64 statbuf;
	DBT key,data;
	/* must start as NULL: the cursor() failure path jumps to "out"
	 * before a cursor exists */
	DBC *dbcp = NULL;

	log_write("lfsck: pass5: file size correctness\n");

	if ((rc = mds_sizeinfodb->cursor(mds_sizeinfodb, NULL, &dbcp, 0)) != 0){
		log_write("%s: error acquiring cursor for database: %s\n",
			  progname, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}
	/* have the cursor copy each record straight into mds_szinfo1 */
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	data.data = &mds_szinfo1;
	data.size = data.ulen = sizeof(mds_szinfo1);
	data.flags = DB_DBT_USERMEM;
	while ((rc = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
		letocpu_mds_szinfo(&mds_szinfo1);

		if (mds_szinfo1.mds_size != mds_szinfo1.mds_calc_size) {
			if (lfsck_get_path(mds_szinfo1.mds_fid, mds_direntdb,
					   path, sizeof(path))) {
				log_write("%s: failed to get path and update "
					  "size for fid "LPU64"\n",
					  mds_szinfo1.mds_fid);
				fix_failed++;
				continue;
			}

			if (stat64(path, &statbuf)) {
				log_write("%s: pass5: failed to stat %s\n",
					  progname, path);
				fix_failed++;
				continue;
			}
			if (statbuf.st_size == mds_szinfo1.mds_calc_size) {
				VERBOSE(2, "%s: %s really has right size\n",
					progname, path);
			} else {
				log_write("%s: %s size "LPU64" != "LPU64"\n",
					  progname, path, statbuf.st_size,
					  mds_szinfo1.mds_calc_size);
				fixed++;
			}
		}
	}
	if (rc != DB_NOTFOUND) {
		log_write("%s: error getting next element in db: %s\n",
			  progname, db_strerror(rc));
		rc = -EINVAL;
		goto out;
	}
	rc = 0;
	log_write("%s: pass5 finished\n", progname);
out:
	/* same guarded close as the other passes in this file */
	if (dbcp)
		dbcp->c_close(dbcp);
#endif /* LFSCK_CHECK_SIZE */
	return rc;
}

/*
 * Read characters from stdin until a yes/no answer is seen.
 *
 * Returns 1 for 'Y' or 'y', 0 for 'N' or 'n', and -1 if end of input is
 * reached first.  Every other character is skipped.
 */
int get_response()
{
	/* getchar() returns an int so that EOF can be told apart from every
	 * valid byte; storing the result in a char either never matches EOF
	 * (unsigned char) or mistakes byte 0xFF for it (signed char) */
	int c;

	while ((c = getchar()) != EOF) {
		/* strchr() also matches the string terminator, so a NUL
		 * byte would otherwise be accepted as "yes" */
		if (c == '\0')
			continue;

		if (strchr("Yy", c))
			return(1);

		if (strchr("Nn", c))
			return(0);
	}
	return(-1);
}

/* Starting point for each thread */
void *lfsck_start_thread(void *arg)
{
	struct lfsck_thread_info *tinfo = (struct lfsck_thread_info *)arg;
	int i,rc;

	tinfo->status = 0;
	pthread_mutex_lock(&init_mutex);
	if (all_started)
		pthread_mutex_unlock(&init_mutex);
	else
		pthread_cond_wait(&init_cond, &init_mutex);

	if (!all_started)
		pthread_exit(NULL);
	for (i = tinfo->start_ost_idx; i < tinfo->end_ost_idx; i++) {
		rc = run_test(i, tinfo->mds_hdr, tinfo->mds_direntdb,
			      tinfo->mds_sizeinfodb);
		if (rc) {
			log_write("lfsck: ost_idx %d: error running check\n",i);
			tinfo->status = rc;
		}
	}
	pthread_exit(NULL);
}

/* Start threads and run filesystem checks and repair */
int lfsck_run_checks()
{
	struct lfsck_mds_hdr *mds_hdr = NULL;
	struct lfsck_thread_info *tinfo = NULL;
	pthread_t *threads = NULL;
	int rc, i;
	DB *mds_direntdb = NULL;
	DB *mds_hdrdb = NULL;
	DB *mds_sizeinfodb = NULL;
	DBT key, data;
	int num_osts;

	rc = lfsck_opendb(mds_file, MDS_HDR, &mds_hdrdb, 0, 0, 0);
	if (rc != 0) {
		log_write("%s: error opening mds_hdr in %s: rc %d\n",
			  mds_file, rc);
		return(-EINVAL);
	}
	mds_hdr = malloc(sizeof(*mds_hdr));
	if (mds_hdr == NULL) {
		log_write("%s: out of memory allocating DB header (%u)\n",
			  progname, sizeof(*mds_hdr));
		rc = -ENOMEM;
		goto out;
	}
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	mds_hdr->mds_magic = MDS_MAGIC;
	key.data = &mds_hdr->mds_magic;
	key.size = sizeof(mds_hdr->mds_magic);
	data.data = mds_hdr;
	data.size = sizeof(*mds_hdr);
	data.ulen = sizeof(*mds_hdr);
	data.flags = DB_DBT_USERMEM;
	rc = mds_hdrdb->get(mds_hdrdb, NULL, &key, &data, 0);
	if (rc != 0) {
		log_write("%s: error getting mds_hdr info %s: %s\n",
			  progname, mds_file, db_strerror(rc));
		goto out;
	}
	letocpu_mds_hdr(mds_hdr);

	rc = lfsck_opendb(mds_file, MDS_DIRINFO, &mds_direntdb, 0, 0, 0);
	if (rc != 0) {
		log_write("%s: error opening dirinfo db %s: rc %d\n",
			  progname, mds_file, rc);
		goto out;
	}

	rc = lfsck_opendb(mds_file, MDS_SIZEINFO, &mds_sizeinfodb, 0, 0, 0);
	if (rc != 0) {
		log_write("%s: error opening sizeinfo db %s: rc %d\n",
			  progname, mds_file, rc);
		goto out;
	}

	if (lov_tgt_count > mds_hdr->mds_num_osts) {
		fprintf(stderr, "%s: number of osts in lov (%u) > "
				"num referenced in mds (%u) (new ost or "
				"empty filesystem?)\n", progname,
				lov_tgt_count, mds_hdr->mds_num_osts);
		fprintf(stderr, "Do you wish to continue? (y/n)\n");
		if ((rc = get_response()) != 1) {
			log_write("%s: exiting \n", progname);
			goto out;
		}
		fprintf(stderr, "\n");

		num_osts = lov_tgt_count;
	} else {
		num_osts = mds_hdr->mds_num_osts;
	}
	if (num_threads > num_osts)
		num_threads = num_osts;

	tinfo = calloc(num_threads, sizeof(*tinfo));
	if (tinfo == NULL) {
		log_write("%s: out of memory for thread info\n", progname);
		rc = -ENOMEM;
		goto out;
	}
	threads = calloc(num_threads, sizeof(pthread_t));
	if (threads == NULL) {
		log_write("%s: out of memory for threads\n", progname);
		rc =  -ENOMEM;
		goto out;
	}

	all_started = 0;
	for (i = 0; i < num_threads; i++) {
		__u32 end_ost_idx;
		__u32 chunk;

		chunk = num_osts / num_threads;
		if (num_osts % num_threads)
			chunk++;
		tinfo[i].mds_hdr = mds_hdr;
		tinfo[i].mds_direntdb = mds_direntdb;
		tinfo[i].mds_sizeinfodb = mds_sizeinfodb;
		tinfo[i].status = 0;
		tinfo[i].start_ost_idx = (chunk) * i;
		end_ost_idx = (chunk) * (i + 1);
		end_ost_idx = end_ost_idx > num_osts ?
			      num_osts : end_ost_idx;
		tinfo[i].end_ost_idx = end_ost_idx;
		rc = pthread_create(&threads[i], NULL, lfsck_start_thread,
				    &tinfo[i]);
		if (rc) {
			log_write("%s: error starting thread waiting for other"
				  " threads to exit\n", progname);
			pthread_mutex_lock(&init_mutex);
			pthread_cond_broadcast(&init_cond);
			pthread_mutex_unlock(&init_mutex);
			for (--i; i >= 0; i--) {
				pthread_cancel(threads[i]);
			}
			rc = -ENOMEM;
			goto out;
		}
	}
	pthread_mutex_lock(&init_mutex);
	all_started = 1;
	pthread_cond_broadcast(&init_cond);
	pthread_mutex_unlock(&init_mutex);
	for (i = 0; i < num_threads; i++) {
		rc = pthread_join(threads[i], NULL);
		if (tinfo[i].status) {
			log_write("%s: error running thread %u\n", progname, i);
			rc = -EINVAL;
		}
	}

	rc = lfsck_run_pass4(mds_direntdb);
	if (rc != 0)
		goto out;

	rc = lfsck_run_pass5(mds_direntdb, mds_sizeinfodb);

out:
	if (threads)
		free(threads);
	if (tinfo)
		free(tinfo);
	if (mds_hdr)
		free(mds_hdr);
	if (mds_direntdb)
		mds_direntdb->close(mds_direntdb, 0);
	if (mds_hdrdb)
		mds_hdrdb->close(mds_hdrdb, 0);
	if (mds_sizeinfodb)
		mds_sizeinfodb->close(mds_sizeinfodb, 0);

	return(rc);
}

/*
 * Create dir with mode 0700, accepting an already existing directory.
 * Returns 0 on success, a negative errno value if mkdir() or stat() fails
 * and -EINVAL if the name exists but is not a directory.
 */
static int lfsck_ensure_dir(const char *dir)
{
	struct stat statbuf;
	int err;

	VERBOSE(2, "%s: creating %s\n", progname, dir);
	if (mkdir(dir, 0700) == 0)
		return(0);

	/* keep errno before any library call can overwrite it */
	err = errno;
	if (err != EEXIST) {
		fprintf(stderr, "%s: error creating %s: %s\n",
			progname, dir, strerror(err));
		return(-err);
	}

	if (stat(dir, &statbuf)) {
		err = errno;
		fprintf(stderr, "%s: error stat %s: %s\n",
			progname, dir, strerror(err));
		return(-err);
	}

	if (!S_ISDIR(statbuf.st_mode)) {
		fprintf(stderr, "%s: error %s is not a directory\n",
			progname, dir);
		return(-EINVAL);
	}

	return(0);
}

/*
 * Set lostandfounddir to "<mnt_path>/lost+found" and dupedir to
 * "<lostandfounddir>/duplicates" and make sure both directories exist.
 *
 * Returns 0 on success and a negative errno value on failure.
 */
int create_lostandfound()
{
	int rc;

	snprintf(lostandfounddir, PATH_MAX - 1, "%s/lost+found", mnt_path);
	lostandfounddir[PATH_MAX - 1] = '\0';

	rc = lfsck_ensure_dir(lostandfounddir);
	if (rc != 0)
		return(rc);

	snprintf(dupedir, sizeof(dupedir), "%s/duplicates", lostandfounddir);
	dupedir[PATH_MAX - 1] = '\0';

	/* the existence check used to stat lostandfounddir here, so a
	 * non-directory named "duplicates" went unnoticed */
	return(lfsck_ensure_dir(dupedir));
}

/*
 * Program entry point: parse the arguments, require root unless forced,
 * prepare lost+found when saving or deleting, read the LOV configuration
 * and run the checks.
 *
 * Exit status: 16 for usage or permission errors, 8 when setup fails, 2
 * when unfixed errors remain, 1 when errors were fixed and 0 otherwise.
 */
int main(int argc, char *argv[])
{
	int i;

	fprintf(stderr, "lfsck %s (%s)\n", E2FSPROGS_VERSION, E2FSPROGS_DATE);

	if (parse_args(argc, argv)) {
		usage();
		exit(16);
	}

	if (getuid() != 0 && !lfsck_force) {
		fprintf(stderr, "%s: can only be run by root user\n", progname);
		exit(16);
	}

	log_open();

	/* lost+found is only needed when orphans are saved or deleted */
	if (lfsck_save || lfsck_delete) {
		if (create_lostandfound() != 0) {
			log_write("%s: failed to create lost+found directory\n",
				  progname);
			log_close(-1);
			exit(8);
		}
	}

	if (get_lov_config()) {
		log_close(-1);
		exit(8);
	}

	if (lfsck_run_checks())
		log_close(-1);

	/* release the global buffers; free(NULL) is a no-op */
	free(mds_file);
	for (i = 0; i < LOV_MAX_OSTS; i++)
		free(ost_files[i]);
	free(lfsck_duplicates);

	log_close(0);

	if (fix_failed) {
		fprintf(stderr, "%s: exit with %u unfixed errors\n",
			progname, fix_failed);
		return 2;
	}

	printf("%s: fixed %u errors\n", progname, fixed);
	return fixed != 0;
}
