#!/usr/bin/env python
# coding: utf-8

import cgi, os, os.path, hashlib, urllib, shutil

## Automatic metadata analyses

def strip_prefix(string, prefix):
	"""Return *string* with a leading *prefix* removed, if present."""
	if not string.startswith(prefix):
		return string
	return string[len(prefix):]

def extract(filename):
	"""Run libextractor over *filename* and collect its textual metadata.

	Returns a list of (plugin_name, metadata_key, metadata_value)
	triples.  Only plain-text metadata formats (UTF-8 or C string) are
	kept; UTF-8 values are decoded to unicode strings.
	"""
	import extractor, ctypes
	good_types = [extractor.EXTRACTOR_METAFORMAT_UTF8,
		extractor.EXTRACTOR_METAFORMAT_C_STRING]
	x = extractor.Extractor()
	keywords = x.keywordTypes()
	result = []
	def callback(_, plugin, kwtype, mdformat, mdmime, data, datalen):
		# Skip binary metadata formats entirely.
		if mdformat not in good_types: return 0
		assert mdmime == 'text/plain'
		# 'data' arrives as a raw pointer; reinterpret it as a
		# NUL-terminated C string.
		metadata_val = ctypes.cast(data, ctypes.c_char_p).value
		# datalen counts the terminating NUL byte as well.
		assert len(metadata_val) == datalen - 1
		if mdformat == extractor.EXTRACTOR_METAFORMAT_UTF8:
			metadata_val = metadata_val.decode('utf-8')
		metadata_key = keywords[kwtype]
		# Reduce '/path/libextractor_foo.so' to just 'foo'.
		plugin_name, fnext = os.path.splitext(os.path.basename(plugin))
		plugin_name = strip_prefix(plugin_name, 'libextractor_')
		result.append((plugin_name, metadata_key, metadata_val))
		return 0
	x.extract(callback, None, filename)
	return result

def technical_metadata(filename):
	"""Compute checksum and size metadata for *filename*.

	Returns a list of ('FILE', key, value) triples in the same shape as
	extract(): MD5 and SHA-256 hex digests plus the size in bytes.
	"""
	# BUG FIX: read in binary mode and close the handle.  The original
	# used mode "r" and never closed the file; text mode corrupts the
	# digests on platforms with newline translation.
	with open(filename, "rb") as f:
		content = f.read()
	md5sum = hashlib.md5(content).hexdigest()
	sha256sum = hashlib.sha256(content).hexdigest()
	size = os.path.getsize(filename)
	return [('FILE', 'MD5', md5sum),
		('FILE', 'SHA-256', sha256sum),
		('FILE', 'size', size)]

## HTTP and HTML stuff

script_name = 'mexse.cgi'

def urlenc(mapping):
	"""URL-encode a {key: [values]} mapping as repeated query fields.

	Values are unicode strings; each is UTF-8 encoded before quoting.
	"""
	encoded = {}
	for key, vals in mapping.items():
		encoded[key] = [val.encode('utf-8') for val in vals]
	return urllib.urlencode(encoded, True)

def print_header(): print 'content-type: text/html; charset=utf-8\n'

def stub_method(_): print 'content-type: text/plain\n\nNot implemented yet'

def redirect(url):
	print 'Status: 302 moved\nLocation: %s\n' % url

def start_html(title):
        print '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">'''
        print '<html><head><title>%s</title>' % title
	print '<link rel=stylesheet type="text/css" href="http://sange.fi/~atehwa/mine.css">'
        print '</head>\n<body><h1>%s</h1>' % title

def end_html():
	print '</body></html>'

def inhtml(val): return cgi.escape(unicode(val).encode('utf-8'), True)

def url_of(state, metadata):
	"""Build a link to this script for *state*, carrying *metadata* as query args."""
	query = urlenc(metadata)
	return "%s?state=%s&%s" % (script_name, inhtml(state), query)

def html_field(params, name, value):
	"""Render an <input> tag; *params* is raw attribute text, the rest escaped."""
	return '<input %s name="%s" value="%s">' % (
		params, inhtml(name), inhtml(value))

def html_input(name, value):
	"""A 40-column single-line text input bound to *name*."""
	attrs = 'type=text size=40'
	return html_field(attrs, name, value)

def html_fileinput(name):
	"""A file-upload input bound to *name*."""
	attrs = 'type=file'
	return html_field(attrs, name, '')

def html_hidden(name, value):
	"""A hidden form field carrying *value* under *name*."""
	attrs = 'type=hidden'
	return html_field(attrs, name, value)

def html_submit(state, text):
	"""A submit button labelled *text* that also posts the target *state*."""
	state_field = html_hidden('state', state)
	button = html_field('type=submit', '_', text)
	return state_field + button

def html_statelink(state, text, metadata = None):
	"""Render a link to *state* labelled *text*, carrying *metadata* in the URL.

	*metadata* defaults to an empty mapping.  A None sentinel replaces
	the original mutable {} default, which is shared between calls and
	an accident waiting to happen.
	"""
	if metadata is None: metadata = {}
	return '<a href="%s">%s</a>' % (url_of(state, metadata), inhtml(text))

def html_choice(name, values):
	"""Render a <select> named *name* from (key, label) pairs in *values*."""
	options = '\n'.join('<option value="%s">%s</option>' %
		(inhtml(key), inhtml(val)) for key, val in values)
	return '<select name="%s">%s</select>' % (inhtml(name), options)

## Metadata elements

# Element roots accepted from form input: a form field is treated as
# metadata only when the part before its first '.' appears here
# (see read_metadata).
md_roots = ['comment', 'conformsTo', 'creator', 'issued', 'isRequiredBy',
	'medium', 'isReferencedBy', 'keyword', 'contributor', 'SHA256',
	'subject', 'generator', 'title', 'alternate', 'created', 'source',
	'location', 'spatial', 'isPartOf', 'type', 'SHA1', 'description',
	'format', 'hasPart', 'accessRights', 'language', 'extent', 'date',
	'isVersionOf', 'MD5', 'mimetype', 'publisher', 'license', 'version',
	'rightsHolder', 'project', 'funding', 'primaryTopic', 'abstract',
	'available', 'hasFormat', 'accrualMethod', 'rights', 'isFormatOf',
	'isReplacedBy', 'provenance', 'references', 'valid', 'replaces',
	'requires', 'temporal', 'popularity', 'modified', 'hasVersion',
	'discipline', 'metadata', 'identifier']

# Human-readable explanations for element name components; used by
# element_explanation() to build form labels.  Components not listed
# here are shown under their raw name.
md_names = {
	'conformsTo': 'standard conformed to',
	'issued': 'date of publication',
	'isReferencedBy': 'material pointing to this',
	'alternate': 'alternate title',
	'source': 'material this originates from',
	'isPartOf': 'collection this belongs to',
	'isVersionOf': 'time series / history',
	'generator': 'software which produced this',
	'isRequiredBy': 'material which needs this',
	'spatial': 'geographical area',
	'SHA256': 'SHA-256 checksum of content',
	'subject': 'topic or subject',
	'hasVersion': 'material derived from this',
	'hasFormat': 'physical file',
	'creator': 'author or creator',
	'ref': 'reference',
	'SHA1': 'SHA-1 checksum of content',
	'accessRights': 'terms of use',
	'extent': 'size of content',
	'mimetype': 'file format in MIME',
	'charset': 'character encoding',
	'created': 'date of first version',
	'contributor': 'non-primary author or editor',
	'medium': 'medium of transmission',
	'type': 'general category of content type',
	'hasPart': 'part of this material',
	'MD5': 'MD5 checksum of content',
	'rights': 'comment about copyright',
	'popularity': 'quality assessment',
	'shortName': 'short name',
	'modified': 'date of this version',
	'trackno': 'track number',
	'affiliation': 'background organization',
	'time': 'time of day',
	'date': 'date (of what?)',
	'project': 'project that produced this',
	'primaryTopic': 'primary topic',
	'discipline': 'branch of science',
	'rightsHolder': 'owner or rights holder',
	'funding': 'funding grant',
	'funder': 'funding organization',
	'email': 'e-mail address',
	'phone': 'telephone number',
	'grantNumber': 'identifying number',
	'publisher': 'distributor or publisher',
}

def element_explanation(element):
	"""Return a human-readable description of a dotted metadata element."""
	if element.startswith('extent.'):
		return 'size in ' + element[len('extent.'):]
	if '.' not in element:
		return md_names.get(element, element + ' (of dataset)')
	parts = [md_names.get(part, part) for part in element.split('.')]
	parts.reverse()
	return ' of '.join(parts)

def is_multielement(el):
	"""True for elements whose form value may hold several ';'-separated items."""
	multi_names = ('name', 'language', 'subject', 'email', 'phone')
	return element_name(el) in multi_names

def element_parent(element): return '.'.join(element.split('.')[:-1])

def element_root(element): return element.split('.')[0]

def element_name(element):
	"""Return the last component of a dotted element path ('' for empty input)."""
	if not element:
		return ''
	return element.rsplit('.', 1)[-1]

def read_metadata(form):
	"""Collect recognized metadata fields from a CGI *form*.

	Returns {element: [unique unicode values]}; only fields whose root
	component appears in md_roots are kept.
	"""
	md = {}
	for key in form:
		if element_root(key) not in md_roots:
			continue
		unique = set(val.decode('utf-8') for val in form.getlist(key))
		md[key] = list(unique)
	return md

## Metadata heuristics

# Heuristic mapping from extracted key names (libextractor plugins and
# HTTP headers) to candidate metadata elements, best guess first.
# 'ignore' marks keys that are usually noise; guess_elements() extends
# each candidate list with related alternatives.
md_mappings = {
	'unknown': ('ignore', 'title', 'keyword', 'creator', 'date', 'type',
		'description', 'rights', 'identifier'),
	'mimetype': ('mimetype', 'format'),
	'embedded filename': ('ignore', 'hasPart', 'title'),
	'comment': ('description',),
	'title': ('title',),
	'book title': ('title', 'isReferencedBy.title'),
	'book edition': ('version',),
	'book chapter': ('hasPart', 'title'),
	'journal name': ('isReferencedBy.title',),
	'journal volume': ('isReferencedBy.volume',),
	'journal number': ('isReferencedBy.number',),
	'page count': ('extent.pages', 'extent'),
	'page range': ('location.pagerange',),
	'author name': ('creator.name',),
	'author email': ('creator.email',),
	'author institution': ('creator.affiliation.name', 'publisher.name'),
	'publisher': ('publisher.name', 'publisher'),
	"publisher's address": ('publisher.address', 'publisher'),
	'publishing institution': ('publisher.name',
		'publisher.affiliation.name'),
	'publication series': ('isPartOf.title', 'keyword'),
	'publication type': ('type', 'isPartOf.type'),
	'publication year': ('issued', 'issued.year'),
	'publication month': ('issued.month', 'issued'),
	'publication day': ('issued.day', 'issued'),
	'publication date': ('issued', 'modified'),
	'language': ('language',),
	'creation time': ('created.time', 'modified.time', 'created'),
	'URL': ('hasFormat.location',),
	'URI': ('hasFormat.location',),
	'international standard': ('conformsTo', 'format'),
	'MD5': ('MD5',),
	'SHA-1': ('SHA1',),
	'SHA-256': ('SHA256',),
	'GPS latitude ref': ('spatial.latitude.ref', 'spatial'),
	'GPS latitude': ('spatial.latitude', 'spatial'),
	'GPS longitude ref': ('spatial.longitude.ref', 'spatial'),
	'GPS longitude': ('spatial.longitude', 'spatial'),
	'city': ('spatial.name',),
	'sublocation': ('spatial.name',),
	'country': ('spatial.name',),
	'country code': ('spatial.identifier', 'spatial'),
	'description': ('description',),
	'copyright': ('rights', 'license', 'accessRights'),
	'keywords': ('subject',),
	'abstract': ('description',),
	'summary': ('description',),
	'subject': ('subject',),
	'creator': ('creator.name',),
	'format': ('format',),
	'format version': ('format.version',),
	'created by software': ('generator.name',),
	'unknown date': ('date', 'created', 'modified', 'issued'),
	'creation date': ('created', 'modified'),
	'modification date': ('modified', 'created'),
	'last printed': ('issued', 'date'),
	'last saved by': ('creator.name',),
	'modified by software': ('generator.name',),
	'revision history': ('hasVersion', 'description'),
	'embedded file size': ('hasPart.extent',),
	'file type': ('type', 'format'),
	'package name': ('title', 'isPartOf.title'),
	'package version': ('title.version', 'isPartOf.version'),
	'section': ('isPartOf', 'keyword'),
	'upload priority': ('keyword',),
	'dependencies': ('isRequiredBy.title',),
	'maintainer': ('creator.name',),
	'source': ('source',),
	'license': ('license.shortName', 'license', 'rights', 'accessRights'),
	'distribution': ('isPartOf', 'keyword'),
	'software version': ('title.version', 'title'),
	'resource type': ('type', 'format'),
	'camera make': ('generator.name',),
	'camera model': ('generator.model', 'generator.name'),
	'image dimensions': ('extent.dimensions', 'extent'),
	'produced by software': ('generator.name',),
	'character set': ('mimetype.charset', 'format.charset', 'format'),
	'line count': ('extent.lines', 'extent'),
	'word count': ('extent.words', 'extent'),
	'template': ('source',),
	'company': ('creator.affiliation.name',),
	'revision number': ('title.version',),
	'duration': ('extent.seconds', 'extent'),
	'album': ('isPartOf.title',),
	'artist': ('creator.name',),
	'genre': ('subject',),
	'track number': ('location.trackno',),
	'performer': ('creator.name',),
	'contact': ('publisher.address', 'creator.address'),
	'song version': ('title.version',),
	'source device': ('medium',),
	'disclaimer': ('rights', 'comment'),
	'warning': ('comment', 'rights'),
	'writer': ('creator.name',),
	'contributor': ('contributor.name',),
	'movie director': ('creator.name',),
	'song count': ('extent.parts', 'extent'),
	'conductor': ('contributor.name',),
	'composer': ('creator.name',),
	'encoded by': ('generator.name',),
	'original title': ('replaces.title',),
	'original artist': ('replaces.creator.name',),
	'original writer': ('replaces.creator.name',),
	'original release year': ('replaces.issued',),
	'original performer': ('replaces.creator.name',),
	'lyrics': ('description',),
	'subtitle': ('alternate', 'title'),
	'display type': ('type', 'comment'),
	'popularity': ('popularity',),
	'rating': ('popularity',),
	'ripper': ('generator.name',),
	'producer': ('creator.name',),
	'group': ('creator.affiliation.name',),
	'original filename': ('source.title',),
	'size': ('extent.bytes', 'extent'),
	'filename': ('title', 'comment'),
	'content-length': ('extent.bytes', 'extent'),
	'content-encoding': ('format',),
	'content-disposition': ('ignore',),
	'pragma': ('ignore',),
	'cache-control': ('ignore',),
	'set-cookie': ('ignore',),
	'last-modified': ('modified',),
	'content-type': ('mimetype',),
	'location': ('hasFormat.location',),
	'server': ('ignore',),
	'connection': ('ignore',),
	'etag': ('version.identifier', 'identifier',),
	'identifier': ('identifier',),
	'accept-ranges': ('ignore',),
	'date': ('date',),
	'vary': ('ignore',),
}

def guess_elements(key, module):
	"""Suggest candidate metadata elements for an extracted *key*.

	Returns a list of (element, explanation) pairs, best guesses first,
	always ending with the catch-all 'comment' and 'ignore' choices.
	"""
	elements = list(md_mappings.get(key, ()))
	def maybe_add(element):
		if element not in elements:
			elements.append(element)
	def add_wo_suffix(suffix):
		# For each '<x>.name'-style guess, also offer '<x>.identifier'
		# and the bare '<x>'.
		for e in elements[:]:
			if e.endswith(suffix):
				wo_suffix = e[:-len(suffix)]
				maybe_add(wo_suffix + '.identifier')
				maybe_add(wo_suffix)
	add_wo_suffix('.name')
	add_wo_suffix('.title')
	# Plausible companions for each guess, applied in order; earlier
	# additions can trigger later rules (e.g. issued -> modified -> date).
	companions = (
		('title', 'alternate'),
		('publisher', 'creator'),
		('creator', 'contributor'),
		('description', 'comment'),
		('subject', 'keyword'),
		('keyword', 'subject'),
		('format', 'mimetype'),
		('mimetype', 'format'),
		('issued', 'modified'),
		('created', 'modified'),
		('modified', 'date'),
		('hasFormat.location', 'identifier'),
		('identifier', 'location'),
		('creator', 'creator.name'),
		('contributor', 'contributor.name'),
		('publisher', 'publisher.name'),
	)
	for present, suggestion in companions:
		if present in elements:
			maybe_add(suggestion)
	maybe_add('comment')
	maybe_add('ignore')
	return [(elem, element_explanation(elem)) for elem in elements]

def get_dct(root, key, value, md):
	"""Map one metadata item to a flat DCMI Terms field.

	Returns (field, scheme, value): the dct element name, an attribute
	string (xsi:type or xml:lang, possibly empty, with a leading space),
	and the possibly-rewritten value.  *root* is the first component of
	*key*; *md* is the full metadata mapping, consulted for a per-key
	'<key>.language' override.
	"""
	# Roots that must be renamed into a DCT element.  A plain string
	# means the value gets prefixed with the original key; a
	# ('noadd', name) pair keeps the value verbatim.
	md_mapping = {
		'comment': 'description',
		'keyword': ('noadd', 'subject'),
		'generator': 'description',
		'location': ('noadd', 'identifier'),
		'SHA256': 'identifier',
		'SHA1': 'identifier',
		'MD5': 'identifier',
		'mimetype': ('noadd', 'format'),
		'version': 'description',
		'project': 'contributor',
		'funding': 'contributor',
		'primaryTopic': ('noadd', 'subject'),
		'popularity': 'description',
	}
	field = md_mapping.get(root, ('noadd', root))
	if field[0] == 'noadd': field = field[1]
	else: value = key + ':' + value
	# BUG FIX: guard against a bare 'extent' key; the original assumed a
	# dotted key like 'extent.pages' and raised IndexError otherwise.
	if root == 'extent' and '.' in key:
		value = value + ' ' + key.split('.')[1]
	# Heuristic encoding-scheme detection for the xsi:type attribute;
	# later matches override earlier ones.
	scheme = ''
	if len(value) == 2 and value.isupper(): scheme = 'dct:ISO3166'
	if len(value) == 3 and value.islower(): scheme = 'dct:ISO639-3'
	if len(value) >= 10 and value[:4].isdigit() and \
		value[7] == '-' and value[4] == '-': scheme = 'dct:W3CDTF'
	if 'start=' in value and 'end=' in value: scheme = 'dct:Period'
	if 'northlimit=' in value: scheme = 'dct:Box'
	if value.startswith('http://'): scheme = 'dct:URI'
	if value.startswith('urn:'): scheme = 'dct:URI'
	if scheme: scheme = ' xsi:type="%s"' % scheme
	# An explicit language annotation overrides any detected scheme.
	if key + '.language' in md:
		scheme = ' xml:lang="%s"' % md[key + '.language'][0]
	return field, scheme, value

## NRD generation

def make_nrd(metadata, expformat):
	"""Serialize *metadata* as an RDF graph in the NRD schema.

	*metadata* maps dotted element paths to lists of values (as produced
	by read_metadata); *expformat* is a serialization format name passed
	to rdflib's Graph.serialize (e.g. 'xml' or 'n3').  Returns the
	serialized graph.  NOTE: mutates *metadata* by inserting empty
	intermediate nodes for dotted keys.
	"""
	from rdflib import Graph, Namespace, URIRef, Literal, BNode, RDF, RDFS
	ids = {}
	# Reuse or invent a node for *item*: prefer an identifier/location
	# value that looks like a resolvable URI, else a named blank node.
	def find_identifier(item = ''):
		if item in ids: return ids[item]
		if item: prefix = item + '.'
		else: prefix = ''
		sugg = metadata.get(prefix + 'identifier', []) + \
			metadata.get(prefix + 'location', [])
		if item: sugg = sugg + metadata.get(item, [])
		for ident in sugg:
			if ident.startswith('http://') or \
				ident.startswith('urn:') or \
				ident.startswith('mailto:') or \
				ident.startswith('tel:'): return URIRef(ident)
		ids[item] = BNode(item + '_object')
		return ids[item]

	# ensure intermediate nodes
	for key in metadata.keys():
		if element_parent(key) not in metadata:
			metadata[element_parent(key)] = ['']

	# namespaces
	g = Graph()
	XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
	DCT = Namespace('http://purl.org/dc/terms/')
	FOAF = Namespace('http://xmlns.com/foaf/0.1/')
	DCAT = Namespace('http://www.w3.org/ns/dcat#')
	SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
	NRD = Namespace('http://purl.org/net/nrd#')
	ARPFO = Namespace('http://vocab.ox.ac.uk/projectfunding#')
	ORG = Namespace('http://www.w3.org/ns/org#')
	QB = Namespace('http://purl.org/linked-data/cube#')
	FP = Namespace('http://downlode.org/Code/RDF/File_Properties/schema#')
	PROV = Namespace('http://www.w3.org/ns/prov#')
	CC = Namespace('http://creativecommons.org/ns#')
	namespaces = {'xsd': XSD, 'dct': DCT, 'foaf': FOAF, 'dcat': DCAT,
			'skos': SKOS, 'prov': PROV, 'nrd': NRD, 'cc': CC,
			'arpfo': ARPFO, 'org': ORG, 'qb': QB, 'fp': FP}
	for prefix in namespaces:
		g.namespace_manager.bind(prefix, namespaces[prefix])

	# implicit nodes
	ids['ds'] = dataset = find_identifier()
	ids['mf'] = manif = find_identifier('hasFormat')
	ids['metadata'] = mdrecord = URIRef('')
	g.add((dataset, RDF.type, NRD.Dataset))
	g.add((mdrecord, RDF.type, DCAT.CatalogRecord))
	g.add((mdrecord, FOAF.primaryTopic, dataset))
	if 'metadata.identifier' in metadata:
		g.add((mdrecord, NRD.metadataIdentifier,
			Literal(metadata['metadata.identifier'])))
	g.add((manif, RDF.type, DCAT.Distribution))
	g.add((dataset, NRD.manifestation, manif))

	# transform
	# Each entry: element name -> (predicate, range, anchor), where the
	# anchor names the default subject node ('ds' dataset, 'mf'
	# manifestation, or another element).  'moveup' folds the component
	# into its parent element; None drops the item.
	rel_mapping = {
		'comment': (RDFS.comment, RDFS.Literal, 'ds'),
		'conformsTo': (DCT.conformsTo, FOAF.Document, 'ds'),
		'alternate': (DCT.alternate, RDFS.Literal, 'ds'),
		'extent.bytes': (DCAT.byteSize, XSD.integer, 'mf'),
		'extent': (DCT.extent, RDFS.Literal, 'mf'),
		'creator': (NRD.creator, FOAF.Agent, 'ds'),
		'issued': (DCT.issued, XSD.dateTime, 'ds'),
		'abstract': (DCT.abstract, RDFS.Literal, 'ds'),
		'rightsHolder': (NRD.owner, FOAF.Agent, 'ds'),
		'isRequiredBy': (DCT.isRequiredBy, NRD.Dataset, 'ds'),
		'number': None, # TODO, maybe bibo?
		'month': 'moveup',
		'affiliation': (ORG.memberOf, ORG.Organization, None),
		'SHA1': (FP.checksum, FP.Checksum, 'mf'),
		'isReferencedBy': (NRD.usedByPublication, FOAF.Document, 'ds'),
		'year': 'moveup',
		'contributor': (NRD.contributor, FOAF.Agent, 'ds'),
		'SHA256': (FP.checksum, FP.Checksum, 'mf'),
		'subject': (NRD.subject, SKOS.Concept, 'ds'),
		'license': (DCT.license, CC.License, 'ds'),
		'shortName': (RDFS.label, RDFS.Literal, 'ds'),
		'funding': (ARPFO.funds, ARPFO.Funding, 'project'),
		'funder': (ARPFO.provides, ORG.Organization, 'funding'),
		'grantNumber': (ARPFO.grantNumber, RDFS.Literal, None),
		'project': (NRD.producerProject, FOAF.Group, 'ds'),
		'member': (FOAF.member, FOAF.Agent, None),
		'dimensions': 'moveup',
		'generator': (FP.generator, PROV.SoftwareAgent, 'ds'),
		'title': (DCT['title'], RDFS.Literal, 'ds'),
		'pagerange': 'moveup', # TODO, maybe bibo?
		'temporal': (NRD.temporal, RDFS.Literal, 'ds'),
		'charset': 'moveup',
		'primaryTopic': (FOAF.primaryTopic, FOAF.Document, 'ds'),
		'day': 'moveup',
		'isFormatOf': (DCT.isFormatOf, NRD.Dataset, 'ds'),
		'hasPart': (DCT.hasPart, DCAT.Distribution, 'mf'),
		'version': None, # TODO
		'created': (DCT.created, XSD.dateTime, 'ds'),
		'location': (DCT.identifier, RDFS.Resource, 'ds'),
		'hasFormat': None, # This is implicit
		'spatial': (NRD.spatial, RDFS.Resource, 'ds'),
		'discipline': (NRD.discipline, RDFS.Resource, 'ds'),
		'isPartOf': (DCT.isPartOf, NRD.Dataset, 'ds'),
		'type': (DCT.type, RDFS.Literal, 'ds'),
		'email': (FOAF.mbox, RDFS.Resource, None),
		'phone': (FOAF.phone, RDFS.Resource, None),
		'available': (DCT.available, XSD.dateTime, 'ds'),
		'medium': (DCT.medium, RDFS.Literal, 'mf'),
		'description': (DCT.description, RDFS.Literal, 'ds'),
		'name': (FOAF.name, RDFS.Literal, None),
		'format': (DCT['format'], RDFS.Literal, 'mf'),
		'mimetype': (DCAT.mediaType, RDFS.Literal, 'mf'),
		'seconds': 'moveup',
		'accrualMethod': (DCT.accrualMethod, RDFS.Literal, 'ds'),
		'accessRights': (NRD.rights, RDFS.Literal, 'ds'),
		'volume': None, # TODO, maybe bibo?
		'latitude': 'moveup',
		'words': 'moveup',
		'address': None, # TODO, maybe ORG or VCARD?
		'date': (DCT.date, XSD.dateTime, 'ds'),
		'isReplacedBy': (DCT.isReplacedBy, NRD.Dataset, 'ds'),
		'isVersionOf': (DCT.isVersionOf, NRD.Continuity, 'ds'),
		'pages': 'moveup',
		'MD5': (FP.checksum, FP.Checksum, 'mf'),
		'publisher': (NRD.distributor, FOAF.Agent, 'ds'),
		'parts': 'moveup',
		'replaces': (DCT.replaces, NRD.Dataset, 'ds'),
		'keyword': (DCAT.keyword, RDFS.Literal, 'ds'),
		'language': (NRD.language, RDFS.Resource, 'ds'),
		'rights': (DCT.rights, RDFS.Literal, 'ds'),
		'provenance': (DCT.provenance, RDFS.Literal, 'ds'), # TODO
		'ref': None, # Check libextractor docs for this
		'valid': (DCT.valid, XSD.dateTime, 'ds'),
		'popularity': (RDFS.comment, RDFS.Literal, 'ds'),
		'bytes': None,
		'modified': (NRD.modified, XSD.dateTime, 'ds'),
		'longitude': 'moveup',
		'hasVersion': (DCT.hasVersion, NRD.Dataset, 'ds'),
		'trackno': None, # TODO, maybe bibo?
		'source': (DCT.source, RDFS.Resource, 'ds'),
		'time': 'moveup',
		'references': (DCT.references, RDFS.Resource, 'ds'),
		'model': 'moveup',
		'requires': (DCT.requires, RDFS.Resource, 'ds'),
		'lines': 'moveup',
	}

	# turning key-value metadata into triplets
	# Resolve an element path to (subject, predicate, range, reversed?);
	# tries the two-component tail first, then the last component alone.
	def interpret_relation(name):
		if not name: return None, None, None, False
		components = name.split('.')
		if len(components) == 1:
			subjname = ''
			rel = name
		if '.'.join(components[-2:]) in rel_mapping:
			subjname = '.'.join(components[:-2])
			rel = '.'.join(components[-2:])
		else:
			subjname = '.'.join(components[:-1])
			rel = components[-1]
		info = rel_mapping.get(rel, None)
		if not info: return None, None, None, False
		if info == 'moveup':
			# Attach this value to the parent element, tagging it
			# with the component name as a string range.
			subj, pred, rng, rev = interpret_relation(subjname)
			return subj, pred, rel, rev
		pred, rng, domain = info
		rev = (domain in ('project', 'funding'))
		if subjname: subj = find_identifier(subjname)
		else: subj = find_identifier(domain)
		return subj, pred, rng, rev

	def make_literal(key, value):
		# NOTE(review): metadata values are lists, so 'language' here is
		# a list being passed as rdflib's lang= argument -- confirm that
		# this is intended rather than language[0].
		language = metadata.get(key + '.language', None)
		if not language: return Literal(value)
		else: return Literal(value, lang=language)

	# Pick the predicate used to attach a human-readable label to a
	# typed object node.
	def name_rel(rng):
		if rng in (FOAF.Document, NRD.Dataset, DCAT.CatalogRecord,
				DCAT.Distribution, NRD.Continuity):
			return DCT['title']
		if rng in (FOAF.Agent, ORG.Organization, FOAF.Group):
			return FOAF.name
		if rng in (ARPFO.Funding,): return ARPFO.grantNumber
		if rng in (SKOS.Concept,): return SKOS.prefLabel
		if rng in (FP.Checksum,): return FP.checksumValue
		return RDFS.label

	# Emit one triple (plus typing/label triples) per key/value pair.
	for key, value in ((k, v) for k in metadata for v in metadata[k]):
		s, p, rng, rev = interpret_relation(key)
		if p is None: continue # XXX: log?
		if isinstance(rng, str):
			value = rng + ': ' + value
			rng = RDFS.Literal
		if rng in (XSD.integer, XSD.dateTime):
			obj = Literal(value, datatype=rng)
		elif rng == RDFS.Literal:
			if value: obj = make_literal(key, value)
			else: continue
		else:
			obj = find_identifier(key)
			if rng != RDFS.Resource:
				g.add(((obj, s)[rev], RDF.type, rng))
			if value and URIRef(value) != obj:
				objobj = make_literal(key, value)
				g.add((obj, name_rel(rng), objobj))
		if s == obj and p == DCT.identifier: continue
		if rev: g.add((obj, p, s))
		else: g.add((s, p, obj))
		if key in ('MD5', 'SHA1', 'SHA256'):
			g.add((obj, FP.generator,
				URIRef('http://en.wikipedia.org/wiki/' + key)))

	return g.serialize(format=expformat)

## Metadata editing & improvement

# Elements already rendered in the current request; edit_element() uses
# this to show each element at most once across the field groups.
handled_elements = set()

# Placeholder for contextual help texts shown next to an element's value.
def element_help(element, values): return [] # stub

def edit_element(element, metadata):
	if element in handled_elements: return
	values = metadata.get(element, [])
	if not values:
		value = html_input('add_' + element, '')
		if is_multielement(element): value = value + \
			'<br>Separate multiple values by semicolons (;).'
	else: value = '<td><ul>%s</ul></td>' % \
		''.join('<li>%s</li>' % inhtml(val) for val in values)
	handled_elements.add(element)
	def cap(x): return x[0].upper() + x[1:]
	print '<dt>%s</dt>' % inhtml(cap(element_explanation(element)))
	other = ''.join('<p>%s</p>' + item
			for item in element_help(element, values))
	print '  <dd>%s%s</dd>' % (value, other)

## Request handlers

def ask_for_file(form):
	"""Render the front page: upload-by-URL and upload-by-file forms.

	The *form* argument is unused; it is accepted for handler-signature
	uniformity (see handle_request).
	"""
	print_header()
	start_html('Mexse &mdash; Metadata Extraction Service')
	print '''
	<p>This service helps you to produce well-formed metadata for
	your file(s).  The service provides functionality to:</p>
	<ol>
	<li>Extract metadata from the file itself</li>
	<li>Edit and improve the metadata</li>
	<li>Download the metadata in various standard formats</li>
	</ol>
	<p>However, the service does not store the file itself nor the
	metadata.  It is up to you to download and store the
	metadata.</p>
	<p>First, please upload the file to be examined; either</p>
	<table border=0><tr><td>
	<form method=POST>
	<p>give the URL address of a file</p>
	<p>%s<br>%s</p>
	</form>
	</td><td>
	<form method=POST enctype="multipart/form-data">
	<p>or upload a file from your computer</p>
	<p>%s<br>%s</p>
	</form>
	</td></tr></table>
	<p>If you want to skip this phase, you can go directly to %s.</p>
	''' % (html_input('address', 'http://tinyurl.com/tta-header'),
			html_submit('fetch_file', 'Fetch'),
			html_fileinput('datafile'),
			html_submit('receive_file', 'Send'),
			html_statelink('edit_metadata',
				'metadata editing phase'))
	end_html()

def present_metadata(md):
	print_header()
	start_html('Mexse / Metadata Collection')
	print '''
	<p>We have gathered %d metadata items.  Now we guess what kind of
	metadata they are.  Below are some suggestions.  Please check
	that we put them into correct elements.</p>''' % len(md)
	print '''<form method=POST><table border=0>
	<tr><th>Element</th><th>Value</th><th>Origin</th></tr>'''
	n = 0
	for module, key, val in md:
		print '<tr><td>%s</td><td>%s</td><td>(%s)%s</td></tr>' % \
			(html_choice('key%d' % n, guess_elements(key, module)),
			html_input('val%d' % n, val), module +'/'+ key,
			html_hidden('items', n))
		n = n + 1
	print '''</table><p>When done, click here: %s</p></form>''' % \
			html_submit('gather_metadata', 'Looks good')
	end_html()

def fetch_file(form):
	"""Download the URL named in the form, extract its metadata, show it.

	A missing scheme is assumed to be http.  The temporary download is
	cleaned up after extraction; the original leaked it, even though
	receive_file removed its own temporary file.
	"""
	url = form.getfirst('address')
	if not url.startswith('http://'): url = 'http://' + url
	filename, headers = urllib.urlretrieve(url)
	try:
		metadata = extract(filename) + technical_metadata(filename)
	finally:
		# Remove urlretrieve's temporary file even if extraction fails.
		urllib.urlcleanup()
	metadata.append(('FILE', 'filename', os.path.basename(url)))
	for key in headers: metadata.append(('HTTP', key, headers[key]))
	metadata.append(('HTTP', 'location', url))
	present_metadata(metadata)

def receive_file(form):
	"""Store an uploaded file temporarily, extract its metadata, show it."""
	import tempfile
	f = form['datafile'].file
	headers = form['datafile'].headers
	# BUG FIX: os.tempnam() is vulnerable to a symlink race (and warns
	# at runtime); mkstemp creates the file securely.  The 'with' block
	# also guarantees the data is flushed and the handle closed before
	# the extractors read the file back -- the original never closed it.
	fd, filename = tempfile.mkstemp()
	try:
		with os.fdopen(fd, "wb") as out:
			shutil.copyfileobj(f, out)
		metadata = extract(filename) + technical_metadata(filename)
	finally:
		# Always remove the temporary file, even if extraction fails.
		os.unlink(filename)
	metadata.append(('HTTP', 'filename', form['datafile'].filename))
	for key in headers: metadata.append(('HTTP', key, headers[key]))
	present_metadata(metadata)

def gather_metadata(form):
	"""Fold the classified rows back into {element: [values]} and redirect.

	Rows whose chosen element is 'ignore' are dropped.
	"""
	md = {}
	for item in form.getlist('items'):
		key = form.getfirst('key%s' % item)
		if key == 'ignore':
			continue
		value = form.getfirst('val%s' % item).decode('utf-8')
		md.setdefault(key, []).append(value)
	redirect(url_of('edit_metadata', md))

def edit_metadata(form):
	"""Show the grouped metadata-editing page with download links.

	Missing metadata-record timestamps are filled in with the current
	time before rendering.
	"""
	from datetime import datetime
	print_header()
	start_html('Mexse / Improve Metadata')
	md = read_metadata(form)
	if 'metadata.created' not in md:
		md['metadata.created'] = [unicode(datetime.today().isoformat())]
	if 'metadata.modified' not in md:
		md['metadata.modified'] = md['metadata.created']
	print '''
	<div class=marginpar><p>If you are satisfied, you can download
	the metadata as:</p>
	<ul><li>%s</li><li>%s</li><li>%s</li><li>%s</li><li>%s</li></ul>
	</div>''' % (html_statelink('output_dct', 'Flat DCMI Terms', md),
		html_statelink('output_rdf', 'RDF in well-known schemas', md),
		html_statelink('output_nrdxml', 'RDF in NRD schema (XML)', md),
		html_statelink('output_nrdn3', 'RDF in NRD schema (N3)', md),
		html_statelink('output_datacite', 'Datacite metadata', md))
	print '<form method=POST action="%s">' % url_of('change_metadata', md)
	# Fixed element groups first; whatever else is present goes into
	# 'Other information'.  (Python 2: dict.keys() returns a list, so
	# the '+' concatenation below works.)
	for group in (('Basic information',
			'title', 'creator.name', 'creator.affiliation.name'),
			('Reuse information',
			'license.shortName', 'language', 'subject'),
			('Contact information',
			'publisher.name', 'publisher.email', 'publisher.phone'),
			('Tracking information',
			'identifier', 'modified', 'metadata.identifier',
			'metadata.modified'),
			('Legal information',
			'project', 'project.funding.funder',
			'project.funding.grantNumber', 'rightsHolder.name'),
			['Other information'] + md.keys()):
		print '<fieldset><legend>%s</legend>\n<dl>' % group[0]
		for element in group[1:]: edit_element(element, md)
		print '</dl>%s</fieldset>' % \
			html_submit('change_metadata', 'Make all changes')
	end_html()

def change_metadata(form):
	"""Merge 'add_*' form fields into the metadata and bounce back to editing."""
	from datetime import datetime
	md = read_metadata(form)
	for key in form:
		if not key.startswith('add_'):
			continue
		value = form.getfirst(key).decode('utf-8')
		if not value:
			continue
		realkey = strip_prefix(key, 'add_')
		if is_multielement(realkey):
			vals = [val.strip() for val in value.split(';')]
		else:
			vals = [value]
		md[realkey] = md.get(realkey, []) + vals
		# Any addition also refreshes the record's modification stamp.
		md['metadata.modified'] = \
				[unicode(datetime.today().isoformat())]
	redirect(url_of('edit_metadata', md))

def output_dct(form):
	md = read_metadata(form)
	print 'content-type: text/xml; charset=utf-8\n'
	print '<?xml version="1.0" encoding="UTF-8"?>'
	print '''<foaf:Document xmlns:foaf="http://xmlns.com/foaf/0.1/"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xmlns:dct="http://purl.org/dc/terms/">'''
	for key in md:
		root = key.split('.')[0]
		if key.endswith('.language'): continue
		if 'affiliation' in key: continue
		for value in md[key]:
			field, scheme, value = get_dct(root, key, value, md)
			print '<dct:%s%s>%s</dct:%s>' % (field,
				scheme, inhtml(value), field)
	print '</foaf:Document>'

def output_nrdxml(form):
	print 'content-type: application/rdf+xml\n'
	print make_nrd(read_metadata(form), 'xml')

def output_nrdn3(form):
	print 'content-type: text/rdf+n3\n'
	print make_nrd(read_metadata(form), 'n3')

# Output formats not implemented yet; they fall back to the stub handler.
output_rdf = stub_method
output_datacite = stub_method

def test_headers(form): cgi.test()

## Script logic

def handle_request(form):
	"""Dispatch the request to the handler named by the 'state' field.

	With no 'state' field the upload page is shown; unrecognized states
	fall back to the metadata editor.
	"""
	handlers = {
		'ask_for_file': ask_for_file,
		'fetch_file': fetch_file,
		'receive_file': receive_file,
		'gather_metadata': gather_metadata,
		'edit_metadata': edit_metadata,
		'change_metadata': change_metadata,
		'output_dct': output_dct,
		'output_nrdxml': output_nrdxml,
		'output_nrdn3': output_nrdn3,
		'output_datacite': output_datacite,
		'test': test_headers,
	}
	state = form.getfirst('state', 'ask_for_file')
	handler = handlers.get(state, edit_metadata)
	handler(form)

if __name__ == '__main__':
	# Top-level CGI boundary: render any failure as an error page rather
	# than a blank 500.  Catch Exception instead of the original bare
	# 'except:' so SystemExit and KeyboardInterrupt still propagate.
	try: handle_request(cgi.FieldStorage())
	except Exception:
		print_header()
		cgi.print_exception()
