/ .. / / -> download
#include <unistd.h>
#include <getopt.h>
#include <string.h>
#include <libxml/tree.h>
#include <libxml/xmlschemas.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
#include "xml-utils.h"

/* Program name and version, as printed by show_version(). PROG_NAME also
 * starts every message prefix below and appears in the usage line. */
#define PROG_NAME "xml-validate"
#define VERSION "1.1.0"

/* Prefixes for diagnostic and result messages. */
#define ERR_PREFIX PROG_NAME ": ERROR: "
#define SUCCESS_PREFIX PROG_NAME ": SUCCESS: "
#define FAILED_PREFIX PROG_NAME ": FAILED: "

/* printf-style formats for error messages.
 * E_BAD_LIST:           %s = list file name.
 * E_MAX_SCHEMA_PARSERS: %d = number of schema parsers.
 * E_BAD_IDREF:          %s = file name, %ld = line number, %s = IDREF value.
 */
#define E_BAD_LIST ERR_PREFIX "Could not read list file: %s\n"
#define E_MAX_SCHEMA_PARSERS ERR_PREFIX "Maximum number of schemas reached: %d\n"
#define E_BAD_IDREF ERR_PREFIX "%s (%ld): No matching ID for '%s'.\n"

/* Process exit status codes specific to this program. */
#define EXIT_MAX_SCHEMAS 2
#define EXIT_MISSING_SCHEMA 3

/* Namespace URIs of XML Schema documents and of the XML Schema instance
 * attributes (such as noNamespaceSchemaLocation). */
#define XML_SCHEMA_URI BAD_CAST "http://www.w3.org/2001/XMLSchema"
#define XSI_URI BAD_CAST "http://www.w3.org/2001/XMLSchema-instance"

/* How much this program prints. Starts at NORMAL; main() switches it to
 * SILENT for -q and to VERBOSE for -v. */
static enum verbosity_level {SILENT, NORMAL, VERBOSE} verbosity = NORMAL;

/* Which file names, if any, should be listed: none, the invalid ones (-f)
 * or the valid ones (-F), per the help text. */
enum show_fnames { SHOW_NONE, SHOW_INVALID, SHOW_VALID };

/* One cached schema. Schemas are cached to prevent parsing them twice
 * (mainly needed when accessing the schema over a network).
 */
struct xml_schema_parser {
	/* Schema location; the key compared by get_schema_parser(). */
	char *url;
	/* The schema document as returned by read_xml_doc(). */
	xmlDocPtr doc;
	/* Schema parser context created from doc. */
	xmlSchemaParserCtxtPtr ctxt;
	/* Result of parsing the schema with ctxt. */
	xmlSchemaPtr schema;
	/* Validation context used to validate documents against schema. */
	xmlSchemaValidCtxtPtr valid_ctxt;
	/* XPath context over the schema document (doc). */
	xmlXPathContextPtr xpath_ctx;
	/* xs:attribute / xs:element declarations whose type is 'xs:ID'. */
	xmlXPathObjectPtr id;
	/* xs:attribute / xs:element declarations whose type is 'xs:IDREF'. */
	xmlXPathObjectPtr idref;
	/* xs:attribute / xs:element declarations whose type is 'xs:IDREFS'. */
	xmlXPathObjectPtr idrefs;
};

/* Capacity of the schema_parsers array, in entries. Starts at 1 and is
 * doubled by resize_schema_parsers() whenever the array is full. */
static unsigned SCHEMA_PARSERS_MAX = 1;

/* Heap-allocated array of cached schema parsers (allocated in main()). */
static struct xml_schema_parser *schema_parsers;

/* Number of entries of schema_parsers currently in use. */
static int schema_parser_count = 0;

/* Structured error callback that writes a libxml2 error to a stream.
 *
 * userData  The output stream (a FILE *) the message is written to.
 * error     The error to report. When it carries a file name, the message
 *           is preceded by the file name and line number.
 */
static void print_error(void *userData, xmlErrorPtr error)
{
	FILE *out = userData;

	if (!error->file) {
		fprintf(out, ERR_PREFIX "%s", error->message);
		return;
	}

	fprintf(out, ERR_PREFIX "%s (%d): %s", error->file, error->line, error->message);
}

/* Structured error callback that deliberately discards the error; installed
 * in place of print_error() when no output is wanted. */
static void suppress_error(void *userData, xmlErrorPtr error)
{
	/* Both arguments are intentionally ignored. */
	(void) userData;
	(void) error;
}

/* Error callback handed to libxml2 for schema parsing and validation.
 * Defaults to print_error; main() replaces it with suppress_error when the
 * verbosity is SILENT. Note that, unlike the other file-scope variables
 * here, it is not declared static. */
xmlStructuredErrorFunc schema_errfunc = print_error;

/* Find a schema parser by URL. */
static struct xml_schema_parser *get_schema_parser(const char *url)
{
	int i;

	for (i = 0; i < schema_parser_count; ++i) {
		if (strcmp(schema_parsers[i].url, url) == 0) {
			return &(schema_parsers[i]);
		}
	}

	return NULL;
}

/* Create a new schema parser from a URL. */
static struct xml_schema_parser *add_schema_parser(char *url)
{
	struct xml_schema_parser *parser;

	xmlDocPtr doc;
	xmlSchemaParserCtxtPtr ctxt;
	xmlSchemaPtr schema;
	xmlSchemaValidCtxtPtr valid_ctxt;
	xmlXPathContextPtr xpath_ctx;
	xmlXPathObjectPtr id, idref, idrefs;

	/* Read the schema document and create a validating context. */
	doc = read_xml_doc(url);
	ctxt = xmlSchemaNewDocParserCtxt(doc);
	schema = xmlSchemaParse(ctxt);
	valid_ctxt = xmlSchemaNewValidCtxt(schema);

	/* Set custom error functions. */
	xmlSchemaSetParserStructuredErrors(ctxt, schema_errfunc, stderr);
	xmlSchemaSetValidStructuredErrors(valid_ctxt, schema_errfunc, stderr);

	/* Locate xs:ID, xs:IDREF and xs:IDREFS types. */
	xpath_ctx = xmlXPathNewContext(doc);
	xmlXPathRegisterNs(xpath_ctx, BAD_CAST "xs", XML_SCHEMA_URI);
	id = xmlXPathEvalExpression(BAD_CAST "//xs:attribute[@type='xs:ID']|//xs:element[@type='xs:ID']", xpath_ctx);
	idref = xmlXPathEvalExpression(BAD_CAST "//xs:attribute[@type='xs:IDREF']|//xs:element[@type='xs:IDREF']", xpath_ctx);
	idrefs = xmlXPathEvalExpression(BAD_CAST "//xs:attribute[@type='xs:IDREFS']|//xs:element[@type='xs:IDREFS']", xpath_ctx);

	/* Initialize the parser. */
	schema_parsers[schema_parser_count].url = url;
	schema_parsers[schema_parser_count].doc = doc;
	schema_parsers[schema_parser_count].ctxt = ctxt;
	schema_parsers[schema_parser_count].schema = schema;
	schema_parsers[schema_parser_count].valid_ctxt = valid_ctxt;
	schema_parsers[schema_parser_count].xpath_ctx = xpath_ctx;
	schema_parsers[schema_parser_count].id = id;
	schema_parsers[schema_parser_count].idref = idref;
	schema_parsers[schema_parser_count].idrefs = idrefs;

	parser = &schema_parsers[schema_parser_count];

	++schema_parser_count;

	return parser;
}

/* Show help/usage message. */
static void show_help(void)
{
	puts("Usage: " PROG_NAME " [-s <path>] [-F|-f] [-lqvh?] [<file>...]");
	puts("");
	puts("Options:");
	puts("  -F, --valid-filenames  List valid files.");
	puts("  -f, --filenames        List invalid files.");
	puts("  -h, -?, --help         Show help/usage message.");
	puts("  -l, --list             Treat input as list of filenames.");
	puts("  -q, --quiet            Silent (no output).");
	puts("  -s, --schema <path>    Validate against the given schema.");
	puts("  -v, --verbose          Verbose output.");
	puts("  --version              Show version information.");
	puts("  <file>                 Any number of XML documents to validate.");
	LIBXML2_PARSE_LONGOPT_HELP
}

/* Print the program name and version, and the libxml version string in
 * use, to stdout. */
static void show_version(void)
{
	const char *libxml_version = xmlParserVersion;

	printf("%s (xml-utils) %s\n", PROG_NAME, VERSION);
	printf("Using libxml %s\n", libxml_version);
}

/* Check if an attribute or element with a given name and a given ID value
 * exists anywhere in a document.
 *
 * doc    Document to search.
 * fname  Name of the document (currently unused).
 * attr   True to search attributes, false to search elements.
 * name   Name to match; compared against the XPath name() of each node.
 * id     ID value the node's string value must equal.
 *
 * Returns 0 if at least one matching node exists, non-zero otherwise. A
 * failure to create the XPath context or to evaluate the expression is
 * reported as "not found" instead of dereferencing a NULL result.
 */
static int check_id_exists_in_doc(const xmlDocPtr doc, const char *fname, bool attr, const xmlChar *name, const xmlChar *id)
{
	xmlXPathContextPtr ctx;
	xmlXPathObjectPtr obj;
	int err;

	(void) fname;

	if (!(ctx = xmlXPathNewContext(doc))) {
		return 1;
	}

	/* Passing the values as XPath variables avoids any quoting problems
	 * that would come from pasting them into the expression text. */
	xmlXPathRegisterVariable(ctx, BAD_CAST "name", xmlXPathNewString(name));
	xmlXPathRegisterVariable(ctx, BAD_CAST "id", xmlXPathNewString(id));

	if (attr) {
		obj = xmlXPathEvalExpression(BAD_CAST "//@*[name()=$name and .=$id]", ctx);
	} else {
		obj = xmlXPathEvalExpression(BAD_CAST "//*[name()=$name and .=$id]", ctx);
	}

	/* obj is NULL when the expression could not be evaluated. */
	err = !obj || xmlXPathNodeSetIsEmpty(obj->nodesetval);

	if (obj) {
		xmlXPathFreeObject(obj);
	}
	xmlXPathFreeContext(ctx);

	return err;
}

/* Check if a given ID exists in a document. */
static int check_id_exists(const struct xml_schema_parser *parser, const xmlDocPtr doc, const char *fname, const xmlChar *id)
{
	if (!xmlXPathNodeSetIsEmpty(parser->id->nodesetval)) {
		int i;

		for (i = 0; i < parser->id->nodesetval->nodeNr; ++i) {
			xmlNodePtr node = parser->id->nodesetval->nodeTab[i];
			xmlChar *name = xmlGetProp(node, BAD_CAST "name");
			bool matched;

			/* Determine if the given ID matched in the document. */
			matched = check_id_exists_in_doc(
				doc,
				fname,
				xmlStrcmp(node->name, BAD_CAST "attribute") == 0,
				name,
				id) == 0;

			xmlFree(name);

			/* If the IDREF matches any ID in the doc, then exit
			 * successfully. */
			if (matched) {
				return 0;
			}
		}
	}

	/* At this point, no ID in the document matched the IDREF. */
	return 1;
}

/* Check if a specific IDREF value is valid. */
static int check_specific_idref(const struct xml_schema_parser *parser, const xmlDocPtr doc, const char *fname, bool attr, const xmlChar *name)
{
	xmlXPathContextPtr ctx;
	xmlXPathObjectPtr obj;
	int err = 0;

	ctx = xmlXPathNewContext(doc);
	xmlXPathRegisterVariable(ctx, BAD_CAST "name", xmlXPathNewString(name));

	if (attr) {
		obj = xmlXPathEvalExpression(BAD_CAST "//@*[name()=$name]", ctx);
	} else {
		obj = xmlXPathEvalExpression(BAD_CAST "//*[name()=$name]", ctx);
	}

	if (!xmlXPathNodeSetIsEmpty(obj->nodesetval)) {
		int i;

		for (i = 0; i < obj->nodesetval->nodeNr; ++i) {
			int e;
			xmlNodePtr node = obj->nodesetval->nodeTab[i];
			xmlChar *id = xmlNodeGetContent(node);

			e = check_id_exists(parser, doc, fname, id);

			if (e) {
				fprintf(stderr, E_BAD_IDREF, fname, xmlGetLineNo(node), (char *) id);
			}

			err += e;

			xmlFree(id);
		}
	}

	xmlXPathFreeObject(obj);
	xmlXPathFreeContext(ctx);

	return err;
}

/* Check all IDREF values in a document. */
static int check_idref(const struct xml_schema_parser *parser, const xmlDocPtr doc, const char *fname)
{
	int err = 0;

	if (!xmlXPathNodeSetIsEmpty(parser->idref->nodesetval)) {
		int i;
		for (i = 0; i < parser->idref->nodesetval->nodeNr; ++i) {
			xmlNodePtr node = parser->idref->nodesetval->nodeTab[i];
			xmlChar *name = xmlGetProp(node, BAD_CAST "name");
			err += check_specific_idref(
				parser,
				doc,
				fname,
				xmlStrcmp(node->name, BAD_CAST "attribute") == 0,
				name);
			xmlFree(name);
		}
	}

	return err;
}

/* Check all IDREFS values in a document. */
static int check_specific_idrefs(const struct xml_schema_parser *parser, const xmlDocPtr doc, const char *fname, bool attr, const xmlChar *name)
{
	xmlXPathContextPtr ctx;
	xmlXPathObjectPtr obj;
	int err = 0;

	ctx = xmlXPathNewContext(doc);
	xmlXPathRegisterVariable(ctx, BAD_CAST "name", xmlXPathNewString(name));
	
	if (attr) {
		obj = xmlXPathEvalExpression(BAD_CAST "//@*[name()=$name]", ctx);
	} else {
		obj = xmlXPathEvalExpression(BAD_CAST "//*[name()=$name]", ctx);
	}

	if (!xmlXPathNodeSetIsEmpty(obj->nodesetval)) {
		int i;

		for (i = 0; i < obj->nodesetval->nodeNr; ++i) {
			char *ids, *id = NULL;
			xmlNodePtr node = obj->nodesetval->nodeTab[i];

			ids = (char *) xmlNodeGetContent(node);

			while ((id = strtok(id ? NULL : ids, " "))) {
				int e;

				e = check_id_exists(parser, doc, fname, BAD_CAST id);

				if (e) {
					fprintf(stderr, E_BAD_IDREF, fname, xmlGetLineNo(node), (char *) id);
				}

				err += e;
			}

			xmlFree(ids);
		}
	}

	xmlXPathFreeObject(obj);
	xmlXPathFreeContext(ctx);

	return err;
}

static int check_idrefs(const struct xml_schema_parser *parser, const xmlDocPtr doc, const char *fname)
{
	int err = 0;

	if (!xmlXPathNodeSetIsEmpty(parser->idrefs->nodesetval)) {
		int i;
		for (i = 0; i < parser->idrefs->nodesetval->nodeNr; ++i) {
			xmlNodePtr node = parser->idrefs->nodesetval->nodeTab[i];
			xmlChar *name = xmlGetProp(node, BAD_CAST "name");
			err += check_specific_idrefs(
				parser,
				doc,
				fname,
				xmlStrcmp(node->name, BAD_CAST "attribute") == 0,
				name);
			xmlFree(name);
		}
	}

	return err;
}

static void resize_schema_parsers(void)
{
	if (!(schema_parsers = realloc(schema_parsers, (SCHEMA_PARSERS_MAX *= 2) * sizeof(struct xml_schema_parser)))) {
		fprintf(stderr, E_MAX_SCHEMA_PARSERS, schema_parser_count);
		exit(EXIT_MAX_SCHEMAS);
	}
}

static int validate_file(const char *fname, const char *schema, int list)
{
	xmlDocPtr doc;
	xmlNodePtr root;
	char *url;
	struct xml_schema_parser *parser;
	int err = 0;

	if (!(doc = read_xml_doc(fname))) {
		return 1;
	}

	root = xmlDocGetRootElement(doc);

	if (schema) {
		url = strdup(schema);
	} else {
		url = (char *) xmlGetNsProp(root, BAD_CAST "noNamespaceSchemaLocation", XSI_URI);
	}

	if (!url) {
		if (verbosity > SILENT) {
			fprintf(stderr, ERR_PREFIX "%s has no schema.\n", fname);
		}
		return 1;
	}

	if ((parser = get_schema_parser(url))) {
		xmlFree(url);
	} else {
		if (schema_parser_count == SCHEMA_PARSERS_MAX) {
			resize_schema_parsers();
		}

		parser = add_schema_parser(url);
	}

	/* libxml2's XML Schema validator currently does not check ID and
	 * IDREF/IDREFS relationships, so these have been implemented
	 * separately. */
	err += check_idref(parser, doc, fname);
	err += check_idrefs(parser, doc, fname);

	if (xmlSchemaValidateDoc(parser->valid_ctxt, doc)) {
		++err;
	}

	if (verbosity >= VERBOSE) {
		if (err) {
			fprintf(stderr, FAILED_PREFIX "%s fails to validate against schema %s\n", fname, parser->url);
		} else {
			fprintf(stderr, SUCCESS_PREFIX "%s validates against schema %s\n", fname, parser->url);
		}
	}

	if (list && err) {
		printf("%s\n", fname);
	}

	xmlFreeDoc(doc);

	return err;
}

/* Validate every document named in a list file.
 *
 * The list holds one path per line. A line is cut at the first tab,
 * carriage return or newline; lines that are empty after that are skipped.
 *
 * fname        List file to read, or NULL to read the list from stdin.
 * schema       Passed through to validate_file().
 * show_fnames  Passed through to validate_file().
 *
 * Returns the sum of the validate_file() results. If the list file cannot
 * be opened, an error is printed and 0 is returned.
 */
static int validate_file_list(const char *fname, const char *schema, enum show_fnames show_fnames)
{
	FILE *f;
	char path[PATH_MAX];
	int err = 0;

	if (fname) {
		if (!(f = fopen(fname, "r"))) {
			fprintf(stderr, E_BAD_LIST, fname);
			return 0;
		}
	} else {
		f = stdin;
	}

	while (fgets(path, PATH_MAX, f)) {
		/* Cut the line at the first delimiter. Unlike strtok(), this
		 * also empties a line that consists of a delimiter only. */
		path[strcspn(path, "\t\r\n")] = '\0';

		/* A blank line does not name a file. */
		if (path[0] == '\0') {
			continue;
		}

		err += validate_file(path, schema, show_fnames);
	}

	/* stdin is not ours to close. */
	if (fname) {
		fclose(f);
	}

	return err;
}

int main(int argc, char *argv[])
{
	int c, i;
	int err = 0;
	enum show_fnames show_fnames = SHOW_NONE;
	int is_list = 0;
	char *schema = NULL;

	const char *sopts = "flqvs:h?";
	struct option lopts[] = {
		{"version"       , no_argument      , 0, 0},
		{"help"          , no_argument      , 0, 'h'},
		{"filenames"     , no_argument      , 0, 'f'},
		{"list"          , no_argument      , 0, 'l'},
		{"quiet"         , no_argument      , 0, 'q'},
		{"verbose"       , no_argument      , 0, 'v'},
		{"schema"        , required_argument, 0, 's'},
		LIBXML2_PARSE_LONGOPT_DEFS
		{0, 0, 0, 0}
	};
	int loptind = 0;

	schema_parsers = malloc(SCHEMA_PARSERS_MAX * sizeof(struct xml_schema_parser));

	while ((c = getopt_long(argc, argv, sopts, lopts, &loptind)) != -1) {
		switch (c) {
			case 0:
				if (strcmp(lopts[loptind].name, "version") == 0) {
					show_version();
					return EXIT_SUCCESS;
				}
				LIBXML2_PARSE_LONGOPT_HANDLE(lopts, loptind, optarg)
				break;
			case 'F': show_fnames = SHOW_VALID; break;
			case 'f': show_fnames = SHOW_INVALID; break;
			case 'l': is_list = 1; break;
			case 'q': verbosity = SILENT; break;
			case 'v': verbosity = VERBOSE; break;
			case 's': schema = strdup(optarg); break;
			case 'h': 
			case '?': show_help(); return EXIT_SUCCESS;
		}
	}

	LIBXML2_PARSE_INIT

	if (verbosity == SILENT) {
		schema_errfunc = suppress_error;
	}

	xmlSetStructuredErrorFunc(stderr, schema_errfunc);

	if (optind < argc) {
		for (i = optind; i < argc; ++i) {
			if (is_list) {
				err += validate_file_list(argv[i], schema, show_fnames);
			} else {
				err += validate_file(argv[i], schema, show_fnames);
			}
		}
	} else if (is_list) {
		err = validate_file_list(NULL, schema, show_fnames);
	} else {
		err = validate_file("-", schema, show_fnames);
	}

	for (i = 0; i < schema_parser_count; ++i) {
		xmlFree(schema_parsers[i].url);
		xmlSchemaFreeValidCtxt(schema_parsers[i].valid_ctxt);
		xmlSchemaFree(schema_parsers[i].schema);
		xmlSchemaFreeParserCtxt(schema_parsers[i].ctxt);
		xmlXPathFreeObject(schema_parsers[i].id);
		xmlXPathFreeObject(schema_parsers[i].idref);
		xmlXPathFreeObject(schema_parsers[i].idrefs);
		xmlXPathFreeContext(schema_parsers[i].xpath_ctx);
		xmlFreeDoc(schema_parsers[i].doc);
	}

	free(schema_parsers);
	free(schema);
	xmlCleanupParser();

	return err ? EXIT_FAILURE : EXIT_SUCCESS;
}


/ gopher://khzae.net/0/s1kd/xml/xml-utils/src/utils/xml-validate/xml-validate.c
Styles: Light Dark Classic