#!/usr/bin/env python
# coding: utf-8

import cgi, os, os.path, hashlib, urllib, shutil

## Automatic metadata analyses

def strip_prefix(string, prefix):
	"""Return *string* with a leading *prefix* removed, if present."""
	if not string.startswith(prefix):
		return string
	return string[len(prefix):]

def extract(filename):
	"""Run libextractor over *filename* and collect its textual metadata.

	Returns a list of (plugin_name, metadata_key, metadata_value)
	triples.  Only plain-text metadata formats (UTF-8 or C string) are
	kept; UTF-8 values are decoded to unicode strings.
	"""
	import extractor, ctypes
	good_types = [extractor.EXTRACTOR_METAFORMAT_UTF8,
		extractor.EXTRACTOR_METAFORMAT_C_STRING]
	x = extractor.Extractor()
	keywords = x.keywordTypes()
	result = []
	def callback(_, plugin, kwtype, mdformat, mdmime, data, datalen):
		# Skip binary metadata formats entirely.
		if mdformat not in good_types: return 0
		assert mdmime == 'text/plain'
		# 'data' arrives as a raw pointer; reinterpret it as a
		# NUL-terminated C string.
		metadata_val = ctypes.cast(data, ctypes.c_char_p).value
		# datalen counts the terminating NUL byte as well.
		assert len(metadata_val) == datalen - 1
		if mdformat == extractor.EXTRACTOR_METAFORMAT_UTF8:
			metadata_val = metadata_val.decode('utf-8')
		metadata_key = keywords[kwtype]
		# Reduce '/path/libextractor_foo.so' to just 'foo'.
		plugin_name, fnext = os.path.splitext(os.path.basename(plugin))
		plugin_name = strip_prefix(plugin_name, 'libextractor_')
		result.append((plugin_name, metadata_key, metadata_val))
		return 0
	x.extract(callback, None, filename)
	return result

def technical_metadata(filename):
	"""Compute checksum and size metadata for *filename*.

	Returns a list of ('FILE', key, value) triples in the same shape as
	extract(): MD5 and SHA-256 hex digests plus the size in bytes.
	"""
	# BUG FIX: read in binary mode and close the handle.  The original
	# used mode "r" and never closed the file; text mode corrupts the
	# digests on platforms with newline translation.
	with open(filename, "rb") as f:
		content = f.read()
	md5sum = hashlib.md5(content).hexdigest()
	sha256sum = hashlib.sha256(content).hexdigest()
	size = os.path.getsize(filename)
	return [('FILE', 'MD5', md5sum),
		('FILE', 'SHA-256', sha256sum),
		('FILE', 'size', size)]

## HTTP and HTML stuff

script_name = 'mexse.cgi'

def urlenc(mapping):
	"""URL-encode a {key: [values]} mapping as repeated query fields.

	Values are unicode strings; each is UTF-8 encoded before quoting.
	"""
	encoded = {}
	for key, vals in mapping.items():
		encoded[key] = [val.encode('utf-8') for val in vals]
	return urllib.urlencode(encoded, True)

def print_header(): print 'content-type: text/html; charset=utf-8\n'

def stub_method(_): print 'content-type: text/plain\n\nNot implemented yet'

def redirect(url):
	print 'Status: 302 moved\nLocation: %s\n' % url

def start_html(title):
        print '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">'''
        print '<html><head><title>%s</title>' % title
	print '<link rel=stylesheet type="text/css" href="http://sange.fi/~atehwa/mine.css">'
        print '</head>\n<body><h1>%s</h1>' % title

def end_html():
	print '</body></html>'

def inhtml(val): return cgi.escape(unicode(val).encode('utf-8'), True)

def url_of(state, metadata):
	"""Build a link to this script for *state*, carrying *metadata* as query args."""
	query = urlenc(metadata)
	return "%s?state=%s&%s" % (script_name, inhtml(state), query)

def html_field(params, name, value):
	"""Render an <input> tag; *params* is raw attribute text, the rest escaped."""
	return '<input %s name="%s" value="%s">' % (
		params, inhtml(name), inhtml(value))

def html_input(name, value):
	"""A 40-column single-line text input bound to *name*."""
	attrs = 'type=text size=40'
	return html_field(attrs, name, value)

def html_fileinput(name):
	"""A file-upload input bound to *name*."""
	attrs = 'type=file'
	return html_field(attrs, name, '')

def html_hidden(name, value):
	"""A hidden form field carrying *value* under *name*."""
	attrs = 'type=hidden'
	return html_field(attrs, name, value)

def html_submit(state, text):
	"""A submit button labelled *text* that also posts the target *state*."""
	state_field = html_hidden('state', state)
	button = html_field('type=submit', '_', text)
	return state_field + button

def html_statelink(state, text, metadata = None):
	"""Render a link to *state* labelled *text*, carrying *metadata* in the URL.

	*metadata* defaults to an empty mapping.  A None sentinel replaces
	the original mutable {} default, which is shared between calls and
	an accident waiting to happen.
	"""
	if metadata is None: metadata = {}
	return '<a href="%s">%s</a>' % (url_of(state, metadata), inhtml(text))

def html_choice(name, values):
	"""Render a <select> named *name* from (key, label) pairs in *values*."""
	options = '\n'.join('<option value="%s">%s</option>' %
		(inhtml(key), inhtml(val)) for key, val in values)
	return '<select name="%s">%s</select>' % (inhtml(name), options)

## Metadata elements

# Element roots accepted from form input: a form field is treated as
# metadata only when the part before its first '.' appears here
# (see read_metadata).
md_roots = ['comment', 'conformsTo', 'creator', 'issued', 'isRequiredBy',
	'medium', 'isReferencedBy', 'keyword', 'contributor', 'SHA256',
	'subject', 'generator', 'title', 'alternate', 'created', 'source',
	'location', 'spatial', 'isPartOf', 'type', 'SHA1', 'description',
	'format', 'hasPart', 'accessRights', 'language', 'extent', 'date',
	'isVersionOf', 'MD5', 'mimetype', 'publisher', 'license', 'version',
	'rightsHolder', 'project', 'funding', 'primaryTopic', 'abstract',
	'available', 'hasFormat', 'accrualMethod', 'rights', 'isFormatOf',
	'isReplacedBy', 'provenance', 'references', 'valid', 'replaces',
	'requires', 'temporal', 'popularity', 'modified', 'hasVersion',
	'discipline', 'metadata', 'identifier']

# Human-readable explanations for element name components; used by
# element_explanation() to build form labels.  Components not listed
# here are shown under their raw name.
md_names = {
	'conformsTo': 'standard conformed to',
	'issued': 'date of publication',
	'isReferencedBy': 'material pointing to this',
	'alternate': 'alternate title',
	'source': 'material this originates from',
	'isPartOf': 'collection this belongs to',
	'isVersionOf': 'time series / history',
	'generator': 'software which produced this',
	'isRequiredBy': 'material which needs this',
	'spatial': 'geographical area',
	'SHA256': 'SHA-256 checksum of content',
	'subject': 'topic or subject',
	'hasVersion': 'material derived from this',
	'hasFormat': 'physical file',
	'creator': 'author or creator',
	'ref': 'reference',
	'SHA1': 'SHA-1 checksum of content',
	'accessRights': 'terms of use',
	'extent': 'size of content',
	'mimetype': 'file format in MIME',
	'charset': 'character encoding',
	'created': 'date of first version',
	'contributor': 'non-primary author or editor',
	'medium': 'medium of transmission',
	'type': 'general category of content type',
	'hasPart': 'part of this material',
	'MD5': 'MD5 checksum of content',
	'rights': 'comment about copyright',
	'popularity': 'quality assessment',
	'shortName': 'short name',
	'modified': 'date of this version',
	'trackno': 'track number',
	'affiliation': 'background organization',
	'time': 'time of day',
	'date': 'date (of what?)',
	'project': 'project that produced this',
	'primaryTopic': 'primary topic',
	'discipline': 'branch of science',
	'rightsHolder': 'owner or rights holder',
	'funding': 'funding grant',
	'funder': 'funding organization',
	'email': 'e-mail address',
	'phone': 'telephone number',
	'grantNumber': 'identifying number',
	'publisher': 'distributor or publisher',
}

def element_explanation(element):
	"""Return a human-readable description of a dotted metadata element."""
	if element.startswith('extent.'):
		return 'size in ' + element[len('extent.'):]
	if '.' not in element:
		return md_names.get(element, element + ' (of dataset)')
	parts = [md_names.get(part, part) for part in element.split('.')]
	parts.reverse()
	return ' of '.join(parts)

def is_multielement(el):
	"""True for elements whose form value may hold several ';'-separated items."""
	multi_names = ('name', 'language', 'subject', 'email', 'phone')
	return element_name(el) in multi_names

def element_parent(element): return '.'.join(element.split('.')[:-1])

def element_root(element): return element.split('.')[0]

def element_name(element):
	"""Return the last component of a dotted element path ('' for empty input)."""
	if not element:
		return ''
	return element.rsplit('.', 1)[-1]

def read_metadata(form):
	"""Collect recognized metadata fields from a CGI *form*.

	Returns {element: [unique unicode values]}; only fields whose root
	component appears in md_roots are kept.
	"""
	md = {}
	for key in form:
		if element_root(key) not in md_roots:
			continue
		unique = set(val.decode('utf-8') for val in form.getlist(key))
		md[key] = list(unique)
	return md

## Metadata heuristics

# Heuristic mapping from extracted key names (libextractor plugins and
# HTTP headers) to candidate metadata elements, best guess first.
# 'ignore' marks keys that are usually noise; guess_elements() extends
# each candidate list with related alternatives.
md_mappings = {
	'unknown': ('ignore', 'title', 'keyword', 'creator', 'date', 'type',
		'description', 'rights', 'identifier'),
	'mimetype': ('mimetype', 'format'),
	'embedded filename': ('ignore', 'hasPart', 'title'),
	'comment': ('description',),
	'title': ('title',),
	'book title': ('title', 'isReferencedBy.title'),
	'book edition': ('version',),
	'book chapter': ('hasPart', 'title'),
	'journal name': ('isReferencedBy.title',),
	'journal volume': ('isReferencedBy.volume',),
	'journal number': ('isReferencedBy.number',),
	'page count': ('extent.pages', 'extent'),
	'page range': ('location.pagerange',),
	'author name': ('creator.name',),
	'author email': ('creator.email',),
	'author institution': ('creator.affiliation.name', 'publisher.name'),
	'publisher': ('publisher.name', 'publisher'),
	"publisher's address": ('publisher.address', 'publisher'),
	'publishing institution': ('publisher.name',
		'publisher.affiliation.name'),
	'publication series': ('isPartOf.title', 'keyword'),
	'publication type': ('type', 'isPartOf.type'),
	'publication year': ('issued', 'issued.year'),
	'publication month': ('issued.month', 'issued'),
	'publication day': ('issued.day', 'issued'),
	'publication date': ('issued', 'modified'),
	'language': ('language',),
	'creation time': ('created.time', 'modified.time', 'created'),
	'URL': ('hasFormat.location',),
	'URI': ('hasFormat.location',),
	'international standard': ('conformsTo', 'format'),
	'MD5': ('MD5',),
	'SHA-1': ('SHA1',),
	'SHA-256': ('SHA256',),
	'GPS latitude ref': ('spatial.latitude.ref', 'spatial'),
	'GPS latitude': ('spatial.latitude', 'spatial'),
	'GPS longitude ref': ('spatial.longitude.ref', 'spatial'),
	'GPS longitude': ('spatial.longitude', 'spatial'),
	'city': ('spatial.name',),
	'sublocation': ('spatial.name',),
	'country': ('spatial.name',),
	'country code': ('spatial.identifier', 'spatial'),
	'description': ('description',),
	'copyright': ('rights', 'license', 'accessRights'),
	'keywords': ('subject',),
	'abstract': ('description',),
	'summary': ('description',),
	'subject': ('subject',),
	'creator': ('creator.name',),
	'format': ('format',),
	'format version': ('format.version',),
	'created by software': ('generator.name',),
	'unknown date': ('date', 'created', 'modified', 'issued'),
	'creation date': ('created', 'modified'),
	'modification date': ('modified', 'created'),
	'last printed': ('issued', 'date'),
	'last saved by': ('creator.name',),
	'modified by software': ('generator.name',),
	'revision history': ('hasVersion', 'description'),
	'embedded file size': ('hasPart.extent',),
	'file type': ('type', 'format'),
	'package name': ('title', 'isPartOf.title'),
	'package version': ('title.version', 'isPartOf.version'),
	'section': ('isPartOf', 'keyword'),
	'upload priority': ('keyword',),
	'dependencies': ('isRequiredBy.title',),
	'maintainer': ('creator.name',),
	'source': ('source',),
	'license': ('license.shortName', 'license', 'rights', 'accessRights'),
	'distribution': ('isPartOf', 'keyword'),
	'software version': ('title.version', 'title'),
	'resource type': ('type', 'format'),
	'camera make': ('generator.name',),
	'camera model': ('generator.model', 'generator.name'),
	'image dimensions': ('extent.dimensions', 'extent'),
	'produced by software': ('generator.name',),
	'character set': ('mimetype.charset', 'format.charset', 'format'),
	'line count': ('extent.lines', 'extent'),
	'word count': ('extent.words', 'extent'),
	'template': ('source',),
	'company': ('creator.affiliation.name',),
	'revision number': ('title.version',),
	'duration': ('extent.seconds', 'extent'),
	'album': ('isPartOf.title',),
	'artist': ('creator.name',),
	'genre': ('subject',),
	'track number': ('location.trackno',),
	'performer': ('creator.name',),
	'contact': ('publisher.address', 'creator.address'),
	'song version': ('title.version',),
	'source device': ('medium',),
	'disclaimer': ('rights', 'comment'),
	'warning': ('comment', 'rights'),
	'writer': ('creator.name',),
	'contributor': ('contributor.name',),
	'movie director': ('creator.name',),
	'song count': ('extent.parts', 'extent'),
	'conductor': ('contributor.name',),
	'composer': ('creator.name',),
	'encoded by': ('generator.name',),
	'original title': ('replaces.title',),
	'original artist': ('replaces.creator.name',),
	'original writer': ('replaces.creator.name',),
	'original release year': ('replaces.issued',),
	'original performer': ('replaces.creator.name',),
	'lyrics': ('description',),
	'subtitle': ('alternate', 'title'),
	'display type': ('type', 'comment'),
	'popularity': ('popularity',),
	'rating': ('popularity',),
	'ripper': ('generator.name',),
	'producer': ('creator.name',),
	'group': ('creator.affiliation.name',),
	'original filename': ('source.title',),
	'size': ('extent.bytes', 'extent'),
	'filename': ('title', 'comment'),
	'content-length': ('extent.bytes', 'extent'),
	'content-encoding': ('format',),
	'content-disposition': ('ignore',),
	'pragma': ('ignore',),
	'cache-control': ('ignore',),
	'set-cookie': ('ignore',),
	'last-modified': ('modified',),
	'content-type': ('mimetype',),
	'location': ('hasFormat.location',),
	'server': ('ignore',),
	'connection': ('ignore',),
	'etag': ('version.identifier', 'identifier',),
	'identifier': ('identifier',),
	'accept-ranges': ('ignore',),
	'date': ('date',),
	'vary': ('ignore',),
}

def guess_elements(key, module):
	"""Suggest candidate metadata elements for an extracted *key*.

	Returns a list of (element, explanation) pairs, best guesses first,
	always ending with the catch-all 'comment' and 'ignore' choices.
	"""
	elements = list(md_mappings.get(key, ()))
	def maybe_add(element):
		if element not in elements:
			elements.append(element)
	def add_wo_suffix(suffix):
		# For each '<x>.name'-style guess, also offer '<x>.identifier'
		# and the bare '<x>'.
		for e in elements[:]:
			if e.endswith(suffix):
				wo_suffix = e[:-len(suffix)]
				maybe_add(wo_suffix + '.identifier')
				maybe_add(wo_suffix)
	add_wo_suffix('.name')
	add_wo_suffix('.title')
	# Plausible companions for each guess, applied in order; earlier
	# additions can trigger later rules (e.g. issued -> modified -> date).
	companions = (
		('title', 'alternate'),
		('publisher', 'creator'),
		('creator', 'contributor'),
		('description', 'comment'),
		('subject', 'keyword'),
		('keyword', 'subject'),
		('format', 'mimetype'),
		('mimetype', 'format'),
		('issued', 'modified'),
		('created', 'modified'),
		('modified', 'date'),
		('hasFormat.location', 'identifier'),
		('identifier', 'location'),
		('creator', 'creator.name'),
		('contributor', 'contributor.name'),
		('publisher', 'publisher.name'),
	)
	for present, suggestion in companions:
		if present in elements:
			maybe_add(suggestion)
	maybe_add('comment')
	maybe_add('ignore')
	return [(elem, element_explanation(elem)) for elem in elements]

def get_dct(root, key, value, md):
	"""Map one metadata item to a flat DCMI Terms field.

	Returns (field, scheme, value): the dct element name, an attribute
	string (xsi:type or xml:lang, possibly empty, with a leading space),
	and the possibly-rewritten value.  *root* is the first component of
	*key*; *md* is the full metadata mapping, consulted for a per-key
	'<key>.language' override.
	"""
	# Roots that must be renamed into a DCT element.  A plain string
	# means the value gets prefixed with the original key; a
	# ('noadd', name) pair keeps the value verbatim.
	md_mapping = {
		'comment': 'description',
		'keyword': ('noadd', 'subject'),
		'generator': 'description',
		'location': ('noadd', 'identifier'),
		'SHA256': 'identifier',
		'SHA1': 'identifier',
		'MD5': 'identifier',
		'mimetype': ('noadd', 'format'),
		'version': 'description',
		'project': 'contributor',
		'funding': 'contributor',
		'primaryTopic': ('noadd', 'subject'),
		'popularity': 'description',
	}
	field = md_mapping.get(root, ('noadd', root))
	if field[0] == 'noadd': field = field[1]
	else: value = key + ':' + value
	# BUG FIX: guard against a bare 'extent' key; the original assumed a
	# dotted key like 'extent.pages' and raised IndexError otherwise.
	if root == 'extent' and '.' in key:
		value = value + ' ' + key.split('.')[1]
	# Heuristic encoding-scheme detection for the xsi:type attribute;
	# later matches override earlier ones.
	scheme = ''
	if len(value) == 2 and value.isupper(): scheme = 'dct:ISO3166'
	if len(value) == 3 and value.islower(): scheme = 'dct:ISO639-3'
	if len(value) >= 10 and value[:4].isdigit() and \
		value[7] == '-' and value[4] == '-': scheme = 'dct:W3CDTF'
	if 'start=' in value and 'end=' in value: scheme = 'dct:Period'
	if 'northlimit=' in value: scheme = 'dct:Box'
	if value.startswith('http://'): scheme = 'dct:URI'
	if value.startswith('urn:'): scheme = 'dct:URI'
	if scheme: scheme = ' xsi:type="%s"' % scheme
	# An explicit language annotation overrides any detected scheme.
	if key + '.language' in md:
		scheme = ' xml:lang="%s"' % md[key + '.language'][0]
	return field, scheme, value

## NRD generation

def make_nrd(metadata, expformat):
	"""Serialize *metadata* as an RDF graph in the NRD schema.

	*metadata* maps dotted element paths to lists of values (as produced
	by read_metadata); *expformat* is a serialization format name passed
	to rdflib's Graph.serialize (e.g. 'xml' or 'n3').  Returns the
	serialized graph.  NOTE: mutates *metadata* by inserting empty
	intermediate nodes for dotted keys.
	"""
	from rdflib import Graph, Namespace, URIRef, Literal, BNode, RDF, RDFS
	ids = {}
	# Reuse or invent a node for *item*: prefer an identifier/location
	# value that looks like a resolvable URI, else a named blank node.
	def find_identifier(item = ''):
		if item in ids: return ids[item]
		if item: prefix = item + '.'
		else: prefix = ''
		sugg = metadata.get(prefix + 'identifier', []) + \
			metadata.get(prefix + 'location', [])
		if item: sugg = sugg + metadata.get(item, [])
		for ident in sugg:
			if ident.startswith('http://') or \
				ident.startswith('urn:') or \
				ident.startswith('mailto:') or \
				ident.startswith('tel:'): return URIRef(ident)
		ids[item] = BNode(item + '_object')
		return ids[item]

	# ensure intermediate nodes
	for key in metadata.keys():
		if element_parent(key) not in metadata:
			metadata[element_parent(key)] = ['']

	# namespaces
	g = Graph()
	XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
	DCT = Namespace('http://purl.org/dc/terms/')
	FOAF = Namespace('http://xmlns.com/foaf/0.1/')
	DCAT = Namespace('http://www.w3.org/ns/dcat#')
	SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
	NRD = Namespace('http://purl.org/net/nrd#')
	ARPFO = Namespace('http://vocab.ox.ac.uk/projectfunding#')
	ORG = Namespace('http://www.w3.org/ns/org#')
	QB = Namespace('http://purl.org/linked-data/cube#')
	FP = Namespace('http://downlode.org/Code/RDF/File_Properties/schema#')
	PROV = Namespace('http://www.w3.org/ns/prov#')
	CC = Namespace('http://creativecommons.org/ns#')
	namespaces = {'xsd': XSD, 'dct': DCT, 'foaf': FOAF, 'dcat': DCAT,
			'skos': SKOS, 'prov': PROV, 'nrd': NRD, 'cc': CC,
			'arpfo': ARPFO, 'org': ORG, 'qb': QB, 'fp': FP}
	for prefix in namespaces:
		g.namespace_manager.bind(prefix, namespaces[prefix])

	# implicit nodes
	ids['ds'] = dataset = find_identifier()
	ids['mf'] = manif = find_identifier('hasFormat')
	ids['metadata'] = mdrecord = URIRef('')
	g.add((dataset, RDF.type, NRD.Dataset))
	g.add((mdrecord, RDF.type, DCAT.CatalogRecord))
	g.add((mdrecord, FOAF.primaryTopic, dataset))
	if 'metadata.identifier' in metadata:
		g.add((mdrecord, NRD.metadataIdentifier,
			Literal(metadata['metadata.identifier'])))
	g.add((manif, RDF.type, DCAT.Distribution))
	g.add((dataset, NRD.manifestation, manif))

	# transform
	# Each entry: element name -> (predicate, range, anchor), where the
	# anchor names the default subject node ('ds' dataset, 'mf'
	# manifestation, or another element).  'moveup' folds the component
	# into its parent element; None drops the item.
	rel_mapping = {
		'comment': (RDFS.comment, RDFS.Literal, 'ds'),
		'conformsTo': (DCT.conformsTo, FOAF.Document, 'ds'),
		'alternate': (DCT.alternate, RDFS.Literal, 'ds'),
		'extent.bytes': (DCAT.byteSize, XSD.integer, 'mf'),
		'extent': (DCT.extent, RDFS.Literal, 'mf'),
		'creator': (NRD.creator, FOAF.Agent, 'ds'),
		'issued': (DCT.issued, XSD.dateTime, 'ds'),
		'abstract': (DCT.abstract, RDFS.Literal, 'ds'),
		'rightsHolder': (NRD.owner, FOAF.Agent, 'ds'),
		'isRequiredBy': (DCT.isRequiredBy, NRD.Dataset, 'ds'),
		'number': None, # TODO, maybe bibo?
		'month': 'moveup',
		'affiliation': (ORG.memberOf, ORG.Organization, None),
		'SHA1': (FP.checksum, FP.Checksum, 'mf'),
		'isReferencedBy': (NRD.usedByPublication, FOAF.Document, 'ds'),
		'year': 'moveup',
		'contributor': (NRD.contributor, FOAF.Agent, 'ds'),
		'SHA256': (FP.checksum, FP.Checksum, 'mf'),
		'subject': (NRD.subject, SKOS.Concept, 'ds'),
		'license': (DCT.license, CC.License, 'ds'),
		'shortName': (RDFS.label, RDFS.Literal, 'ds'),
		'funding': (ARPFO.funds, ARPFO.Funding, 'project'),
		'funder': (ARPFO.provides, ORG.Organization, 'funding'),
		'grantNumber': (ARPFO.grantNumber, RDFS.Literal, None),
		'project': (NRD.producerProject, FOAF.Group, 'ds'),
		'member': (FOAF.member, FOAF.Agent, None),
		'dimensions': 'moveup',
		'generator': (FP.generator, PROV.SoftwareAgent, 'ds'),
		'title': (DCT['title'], RDFS.Literal, 'ds'),
		'pagerange': 'moveup', # TODO, maybe bibo?
		'temporal': (NRD.temporal, RDFS.Literal, 'ds'),
		'charset': 'moveup',
		'primaryTopic': (FOAF.primaryTopic, FOAF.Document, 'ds'),
		'day': 'moveup',
		'isFormatOf': (DCT.isFormatOf, NRD.Dataset, 'ds'),
		'hasPart': (DCT.hasPart, DCAT.Distribution, 'mf'),
		'version': None, # TODO
		'created': (DCT.created, XSD.dateTime, 'ds'),
		'location': (DCT.identifier, RDFS.Resource, 'ds'),
		'hasFormat': None, # This is implicit
		'spatial': (NRD.spatial, RDFS.Resource, 'ds'),
		'discipline': (NRD.discipline, RDFS.Resource, 'ds'),
		'isPartOf': (DCT.isPartOf, NRD.Dataset, 'ds'),
		'type': (DCT.type, RDFS.Literal, 'ds'),
		'email': (FOAF.mbox, RDFS.Resource, None),
		'phone': (FOAF.phone, RDFS.Resource, None),
		'available': (DCT.available, XSD.dateTime, 'ds'),
		'medium': (DCT.medium, RDFS.Literal, 'mf'),
		'description': (DCT.description, RDFS.Literal, 'ds'),
		'name': (FOAF.name, RDFS.Literal, None),
		'format': (DCT['format'], RDFS.Literal, 'mf'),
		'mimetype': (DCAT.mediaType, RDFS.Literal, 'mf'),
		'seconds': 'moveup',
		'accrualMethod': (DCT.accrualMethod, RDFS.Literal, 'ds'),
		'accessRights': (NRD.rights, RDFS.Literal, 'ds'),
		'volume': None, # TODO, maybe bibo?
		'latitude': 'moveup',
		'words': 'moveup',
		'address': None, # TODO, maybe ORG or VCARD?
		'date': (DCT.date, XSD.dateTime, 'ds'),
		'isReplacedBy': (DCT.isReplacedBy, NRD.Dataset, 'ds'),
		'isVersionOf': (DCT.isVersionOf, NRD.Continuity, 'ds'),
		'pages': 'moveup',
		'MD5': (FP.checksum, FP.Checksum, 'mf'),
		'publisher': (NRD.distributor, FOAF.Agent, 'ds'),
		'parts': 'moveup',
		'replaces': (DCT.replaces, NRD.Dataset, 'ds'),
		'keyword': (DCAT.keyword, RDFS.Literal, 'ds'),
		'language': (NRD.language, RDFS.Resource, 'ds'),
		'rights': (DCT.rights, RDFS.Literal, 'ds'),
		'provenance': (DCT.provenance, RDFS.Literal, 'ds'), # TODO
		'ref': None, # Check libextractor docs for this
		'valid': (DCT.valid, XSD.dateTime, 'ds'),
		'popularity': (RDFS.comment, RDFS.Literal, 'ds'),
		'bytes': None,
		'modified': (NRD.modified, XSD.dateTime, 'ds'),
		'longitude': 'moveup',
		'hasVersion': (DCT.hasVersion, NRD.Dataset, 'ds'),
		'trackno': None, # TODO, maybe bibo?
		'source': (DCT.source, RDFS.Resource, 'ds'),
		'time': 'moveup',
		'references': (DCT.references, RDFS.Resource, 'ds'),
		'model': 'moveup',
		'requires': (DCT.requires, RDFS.Resource, 'ds'),
		'lines': 'moveup',
	}

	# turning key-value metadata into triplets
	# Resolve an element path to (subject, predicate, range, reversed?);
	# tries the two-component tail first, then the last component alone.
	def interpret_relation(name):
		if not name: return None, None, None, False
		components = name.split('.')
		if len(components) == 1:
			subjname = ''
			rel = name
		if '.'.join(components[-2:]) in rel_mapping:
			subjname = '.'.join(components[:-2])
			rel = '.'.join(components[-2:])
		else:
			subjname = '.'.join(components[:-1])
			rel = components[-1]
		info = rel_mapping.get(rel, None)
		if not info: return None, None, None, False
		if info == 'moveup':
			# Attach this value to the parent element, tagging it
			# with the component name as a string range.
			subj, pred, rng, rev = interpret_relation(subjname)
			return subj, pred, rel, rev
		pred, rng, domain = info
		rev = (domain in ('project', 'funding'))
		if subjname: subj = find_identifier(subjname)
		else: subj = find_identifier(domain)
		return subj, pred, rng, rev

	def make_literal(key, value):
		# NOTE(review): metadata values are lists, so 'language' here is
		# a list being passed as rdflib's lang= argument -- confirm that
		# this is intended rather than language[0].
		language = metadata.get(key + '.language', None)
		if not language: return Literal(value)
		else: return Literal(value, lang=language)

	# Pick the predicate used to attach a human-readable label to a
	# typed object node.
	def name_rel(rng):
		if rng in (FOAF.Document, NRD.Dataset, DCAT.CatalogRecord,
				DCAT.Distribution, NRD.Continuity):
			return DCT['title']
		if rng in (FOAF.Agent, ORG.Organization, FOAF.Group):
			return FOAF.name
		if rng in (ARPFO.Funding,): return ARPFO.grantNumber
		if rng in (SKOS.Concept,): return SKOS.prefLabel
		if rng in (FP.Checksum,): return FP.checksumValue
		return RDFS.label

	# Emit one triple (plus typing/label triples) per key/value pair.
	for key, value in ((k, v) for k in metadata for v in metadata[k]):
		s, p, rng, rev = interpret_relation(key)
		if p is None: continue # XXX: log?
		if isinstance(rng, str):
			value = rng + ': ' + value
			rng = RDFS.Literal
		if rng in (XSD.integer, XSD.dateTime):
			obj = Literal(value, datatype=rng)
		elif rng == RDFS.Literal:
			if value: obj = make_literal(key, value)
			else: continue
		else:
			obj = find_identifier(key)
			if rng != RDFS.Resource:
				g.add(((obj, s)[rev], RDF.type, rng))
			if value and URIRef(value) != obj:
				objobj = make_literal(key, value)
				g.add((obj, name_rel(rng), objobj))
		if s == obj and p == DCT.identifier: continue
		if rev: g.add((obj, p, s))
		else: g.add((s, p, obj))
		if key in ('MD5', 'SHA1', 'SHA256'):
			g.add((obj, FP.generator,
				URIRef('http://en.wikipedia.org/wiki/' + key)))

	return g.serialize(format=expformat)

## Metadata editing & improvement

# Elements already rendered in the current request; edit_element() uses
# this to show each element at most once across the field groups.
handled_elements = set()

# Placeholder for contextual help texts shown next to an element's value.
def element_help(element, values): return [] # stub

def edit_element(element, metadata):
	if element in handled_elements: return
	values = metadata.get(element, [])
	if not values:
		value = html_input('add_' + element, '')
		if is_multielement(element): value = value + \
			'<br>Separate multiple values by semicolons (;).'
	else: value = '<td><ul>%s</ul></td>' % \
		''.join('<li>%s</li>' % inhtml(val) for val in values)
	handled_elements.add(element)
	def cap(x): return x[0].upper() + x[1:]
	print '<dt>%s</dt>' % inhtml(cap(element_explanation(element)))
	other = ''.join('<p>%s</p>' + item
			for item in element_help(element, values))
	print '  <dd>%s%s</dd>' % (value, other)

## Request handlers

def ask_for_file(form):
	"""Render the front page: upload-by-URL and upload-by-file forms.

	The *form* argument is unused; it is accepted for handler-signature
	uniformity (see handle_request).
	"""
	print_header()
	start_html('Mexse &mdash; Metadata Extraction Service')
	print '''
	<p>This service helps you to produce well-formed metadata for
	your file(s).  The service provides functionality to:</p>
	<ol>
	<li>Extract metadata from the file itself</li>
	<li>Edit and improve the metadata</li>
	<li>Download the metadata in various standard formats</li>
	</ol>
	<p>However, the service does not store the file itself nor the
	metadata.  It is up to you to download and store the
	metadata.</p>
	<p>First, please upload the file to be examined; either</p>
	<table border=0><tr><td>
	<form method=POST>
	<p>give the URL address of a file</p>
	<p>%s<br>%s</p>
	</form>
	</td><td>
	<form method=POST enctype="multipart/form-data">
	<p>or upload a file from your computer</p>
	<p>%s<br>%s</p>
	</form>
	</td></tr></table>
	<p>If you want to skip this phase, you can go directly to %s.</p>
	''' % (html_input('address', 'http://tinyurl.com/tta-header'),
			html_submit('fetch_file', 'Fetch'),
			html_fileinput('datafile'),
			html_submit('receive_file', 'Send'),
			html_statelink('edit_metadata',
				'metadata editing phase'))
	end_html()

def present_metadata(md):
	print_header()
	start_html('Mexse / Metadata Collection')
	print '''
	<p>We have gathered %d metadata items.  Now we guess what kind of
	metadata they are.  Below are some suggestions.  Please check
	that we put them into correct elements.</p>''' % len(md)
	print '''<form method=POST><table border=0>
	<tr><th>Element</th><th>Value</th><th>Origin</th></tr>'''
	n = 0
	for module, key, val in md:
		print '<tr><td>%s</td><td>%s</td><td>(%s)%s</td></tr>' % \
			(html_choice('key%d' % n, guess_elements(key, module)),
			html_input('val%d' % n, val), module +'/'+ key,
			html_hidden('items', n))
		n = n + 1
	print '''</table><p>When done, click here: %s</p></form>''' % \
			html_submit('gather_metadata', 'Looks good')
	end_html()

def fetch_file(form):
	"""Download the URL named in the form, extract its metadata, show it.

	A missing scheme is assumed to be http.  The temporary download is
	cleaned up after extraction; the original leaked it, even though
	receive_file removed its own temporary file.
	"""
	url = form.getfirst('address')
	if not url.startswith('http://'): url = 'http://' + url
	filename, headers = urllib.urlretrieve(url)
	try:
		metadata = extract(filename) + technical_metadata(filename)
	finally:
		# Remove urlretrieve's temporary file even if extraction fails.
		urllib.urlcleanup()
	metadata.append(('FILE', 'filename', os.path.basename(url)))
	for key in headers: metadata.append(('HTTP', key, headers[key]))
	metadata.append(('HTTP', 'location', url))
	present_metadata(metadata)

def receive_file(form):
	"""Store an uploaded file temporarily, extract its metadata, show it."""
	import tempfile
	f = form['datafile'].file
	headers = form['datafile'].headers
	# BUG FIX: os.tempnam() is vulnerable to a symlink race (and warns
	# at runtime); mkstemp creates the file securely.  The 'with' block
	# also guarantees the data is flushed and the handle closed before
	# the extractors read the file back -- the original never closed it.
	fd, filename = tempfile.mkstemp()
	try:
		with os.fdopen(fd, "wb") as out:
			shutil.copyfileobj(f, out)
		metadata = extract(filename) + technical_metadata(filename)
	finally:
		# Always remove the temporary file, even if extraction fails.
		os.unlink(filename)
	metadata.append(('HTTP', 'filename', form['datafile'].filename))
	for key in headers: metadata.append(('HTTP', key, headers[key]))
	present_metadata(metadata)

def gather_metadata(form):
	"""Fold the classified rows back into {element: [values]} and redirect.

	Rows whose chosen element is 'ignore' are dropped.
	"""
	md = {}
	for item in form.getlist('items'):
		key = form.getfirst('key%s' % item)
		if key == 'ignore':
			continue
		value = form.getfirst('val%s' % item).decode('utf-8')
		md.setdefault(key, []).append(value)
	redirect(url_of('edit_metadata', md))

def edit_metadata(form):
	"""Show the grouped metadata-editing page with download links.

	Missing metadata-record timestamps are filled in with the current
	time before rendering.
	"""
	from datetime import datetime
	print_header()
	start_html('Mexse / Improve Metadata')
	md = read_metadata(form)
	if 'metadata.created' not in md:
		md['metadata.created'] = [unicode(datetime.today().isoformat())]
	if 'metadata.modified' not in md:
		md['metadata.modified'] = md['metadata.created']
	print '''
	<div class=marginpar><p>If you are satisfied, you can download
	the metadata as:</p>
	<ul><li>%s</li><li>%s</li><li>%s</li><li>%s</li><li>%s</li></ul>
	</div>''' % (html_statelink('output_dct', 'Flat DCMI Terms', md),
		html_statelink('output_rdf', 'RDF in well-known schemas', md),
		html_statelink('output_nrdxml', 'RDF in NRD schema (XML)', md),
		html_statelink('output_nrdn3', 'RDF in NRD schema (N3)', md),
		html_statelink('output_datacite', 'Datacite metadata', md))
	print '<form method=POST action="%s">' % url_of('change_metadata', md)
	# Fixed element groups first; whatever else is present goes into
	# 'Other information'.  (Python 2: dict.keys() returns a list, so
	# the '+' concatenation below works.)
	for group in (('Basic information',
			'title', 'creator.name', 'creator.affiliation.name'),
			('Reuse information',
			'license.shortName', 'language', 'subject'),
			('Contact information',
			'publisher.name', 'publisher.email', 'publisher.phone'),
			('Tracking information',
			'identifier', 'modified', 'metadata.identifier',
			'metadata.modified'),
			('Legal information',
			'project', 'project.funding.funder',
			'project.funding.grantNumber', 'rightsHolder.name'),
			['Other information'] + md.keys()):
		print '<fieldset><legend>%s</legend>\n<dl>' % group[0]
		for element in group[1:]: edit_element(element, md)
		print '</dl>%s</fieldset>' % \
			html_submit('change_metadata', 'Make all changes')
	end_html()

def change_metadata(form):
	"""Merge 'add_*' form fields into the metadata and bounce back to editing."""
	from datetime import datetime
	md = read_metadata(form)
	for key in form:
		if not key.startswith('add_'):
			continue
		value = form.getfirst(key).decode('utf-8')
		if not value:
			continue
		realkey = strip_prefix(key, 'add_')
		if is_multielement(realkey):
			vals = [val.strip() for val in value.split(';')]
		else:
			vals = [value]
		md[realkey] = md.get(realkey, []) + vals
		# Any addition also refreshes the record's modification stamp.
		md['metadata.modified'] = \
				[unicode(datetime.today().isoformat())]
	redirect(url_of('edit_metadata', md))

def output_dct(form):
	md = read_metadata(form)
	print 'content-type: text/xml; charset=utf-8\n'
	print '<?xml version="1.0" encoding="UTF-8"?>'
	print '''<foaf:Document xmlns:foaf="http://xmlns.com/foaf/0.1/"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xmlns:dct="http://purl.org/dc/terms/">'''
	for key in md:
		root = key.split('.')[0]
		if key.endswith('.language'): continue
		if 'affiliation' in key: continue
		for value in md[key]:
			field, scheme, value = get_dct(root, key, value, md)
			print '<dct:%s%s>%s</dct:%s>' % (field,
				scheme, inhtml(value), field)
	print '</foaf:Document>'

def output_nrdxml(form):
	print 'content-type: application/rdf+xml\n'
	print make_nrd(read_metadata(form), 'xml')

def output_nrdn3(form):
	print 'content-type: text/rdf+n3\n'
	print make_nrd(read_metadata(form), 'n3')

# Output formats not implemented yet; they fall back to the stub handler.
output_rdf = stub_method
output_datacite = stub_method

def test_headers(form): cgi.test()

## Script logic

def handle_request(form):
	"""Dispatch the request to the handler named by the 'state' field.

	With no 'state' field the upload page is shown; unrecognized states
	fall back to the metadata editor.
	"""
	handlers = {
		'ask_for_file': ask_for_file,
		'fetch_file': fetch_file,
		'receive_file': receive_file,
		'gather_metadata': gather_metadata,
		'edit_metadata': edit_metadata,
		'change_metadata': change_metadata,
		'output_dct': output_dct,
		'output_nrdxml': output_nrdxml,
		'output_nrdn3': output_nrdn3,
		'output_datacite': output_datacite,
		'test': test_headers,
	}
	state = form.getfirst('state', 'ask_for_file')
	handler = handlers.get(state, edit_metadata)
	handler(form)

if __name__ == '__main__':
	# Top-level CGI boundary: render any failure as an error page rather
	# than a blank 500.  Catch Exception instead of the original bare
	# 'except:' so SystemExit and KeyboardInterrupt still propagate.
	try: handle_request(cgi.FieldStorage())
	except Exception:
		print_header()
		cgi.print_exception()
