import json
import re
from operator import itemgetter

import jsonref

from ocdskit.exceptions import MissingColumnError
from ocdskit.schema import get_schema_fields
from ocdskit.util import _cast_as_list

# Matches a Markdown inline link, like `[text](url)`. Group 1 captures the link text and group 2 captures the URL.
# See https://stackoverflow.com/questions/30734682/extracting-url-and-anchor-text-from-markdown-using-python
INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
[docs]
def mapping_sheet(
    schema,
    *,
    order_by=None,
    infer_required=False,
    extension_field=None,
    inherit_extension=True,
    include_codelist=False,
    include_deprecated=True,
    include_definitions=False,
    base_uri=None,
):
    """
    Describe all field paths in a JSON Schema, as a list of column names and a list of rows.

    Unless ``include_definitions=True``, the ``$ref`` properties in the schema are resolved before it is traversed.

    :param dict schema: a JSON schema
    :param str order_by: the column by which to sort the rows
    :param bool infer_required: whether to infer that a field is required if "null" is not in its ``type``
    :param str extension_field: the property in the JSON schema containing the name of the extension in which each
                                field was defined
    :param bool inherit_extension: whether fields defined via $ref properties inherit the "extension" value of their
                                   parent field
    :param bool include_codelist: whether to include a "codelist" column
    :param bool include_deprecated: whether to include any deprecated fields
    :param bool include_definitions: whether to traverse the "$defs" and/or "definitions" properties
    :param str base_uri: the URL to resolve relative references against
    :returns: the column names, and the rows (one dict per row, keyed by column name)
    :rtype: tuple

    The columns are:

    :``section``: The first part of the JSON path to the field in the data, e.g. ``tender``
    :``path``: The JSON path to the field in the data, e.g. ``tender/id``
    :``title``: The field's ``title`` in the JSON schema. If the field has no ``title``, defaults to the field's name
      followed by "*".
    :``description``: The field's ``description`` in the JSON schema. URLs are removed (see the ``links`` column).
    :``type``: A comma-separated list of the field's ``type`` in the JSON schema, excluding "null". If the field has
      no ``type``, defaults to "unknown".
    :``range``: The field's allowed number of occurrences.

      * "0..1" if the field defines an optional literal value.
      * "0..n" if the field defines an optional array.
      * "1..1" if the field defines a required literal value.
      * "1..n" if the field defines a required array.
    :``values``: If the field's schema sets:

      * ``format``: the ``format``
      * ``pattern``: "Pattern: " followed by the ``pattern``
      * ``enum``: "Enum: " followed by the ``enum`` as a comma-separated list, excluding ``null``
      * ``items/enum``: "Enum: " followed by the ``items/enum`` as a comma-separated list, excluding ``null``
    :``codelist``: The field's ``codelist`` in the JSON schema
    :``links``: The URLs extracted from the field's ``description``
    :``deprecated``: The OCDS minor version in which the field (or its parent) was deprecated
    :``deprecationNotes``: The explanation for the deprecation of the field
    :``extension``: The name of the extension that introduced the JSON path (see the ``extension_field`` parameter)

    :raises MissingColumnError: if the column by which to order is missing
    """
    skip_definitions = not include_definitions
    if skip_definitions:
        # jsonref.JsonRef is deprecated, but used for backwards-compatibility with jsonref 0.x.
        # The defaults proxies=True and merge_props=False are needed to have two rows for each `$ref`.
        schema = jsonref.JsonRef.replace_refs(schema, base_uri=base_uri, jsonschema=True)

    # Options for rows that are handed over ready-made, and for rows that _add_row has to build itself.
    header_options = dict(
        inherit_extension=inherit_extension,
        include_codelist=include_codelist,
        include_deprecated=include_deprecated,
    )
    field_options = {**header_options, "infer_required": infer_required}

    rows = []
    rows_by_path = {}
    for field in get_schema_fields(schema):
        if skip_definitions and field.definition:
            continue

        subschema = field.schema
        field.sep = "/"

        extension_name = subschema.get(extension_field)

        # A schema that was reached through a `$ref` gets an extra row for the referring object, added first. This
        # keeps any differences between the titles and descriptions of the referrer and the referee, and the extra
        # row can be formatted as a heading for the object.
        if hasattr(subschema, "__reference__"):
            ref_object = dict(subschema.__reference__)
            subschema = dict(subschema)
            # The referring object's extension takes precedence, and it borrows the referee's type if it has none.
            extension_name = ref_object.get(extension_field, extension_name)
            if "type" in subschema:
                ref_object.setdefault("type", subschema["type"])
            _add_row(rows, rows_by_path, field, ref_object, extension_name, **field_options)

        _add_row(rows, rows_by_path, field, subschema, extension_name, **field_options)

        # An array of titled objects gets an extra row for its items, to be used as a header for the object.
        if "items" in subschema:
            items = subschema["items"]
            if "properties" in items and "title" in items:
                header = {
                    "path": field.path,
                    "title": items["title"],
                    "description": items.get("description", ""),
                    "type": items["type"],
                    "deprecated": field.deprecated.get("deprecatedVersion"),  # deprecation from parent
                }
                _add_deprecated(header, items)
                _add_row(rows, rows_by_path, field, items, extension_name, row=header, **header_options)

    if order_by:
        try:
            rows = sorted(rows, key=itemgetter(order_by))
        except KeyError as e:
            raise MissingColumnError(f"the column '{order_by}' doesn't exist - did you make a typo?") from e

    columns = [
        "section",
        "path",
        "title",
        "description",
        "type",
        "range",
        "values",
        "links",
        "deprecated",
        "deprecationNotes",
    ]
    # The optional columns go last, in this order.
    optional_columns = (("extension", extension_field), ("codelist", include_codelist))
    columns.extend(name for name, enabled in optional_columns if enabled)

    return columns, rows


def _add_deprecated(row, schema):
    """
    Copy the deprecation details of a schema onto a row, in place.

    If the schema's ``deprecated`` property is missing or falsy, the row is left untouched. Otherwise, the row's
    "deprecated" and "deprecationNotes" keys are set from the ``deprecatedVersion`` and ``description`` properties of
    the ``deprecated`` object, each defaulting to an empty string.

    :param dict row: the row to update
    :param dict schema: the JSON schema from which to read the ``deprecated`` property
    """
    deprecation = schema.get("deprecated")
    # OCDS for PPPs sets `"deprecated": null`.
    if not deprecation:
        return

    row["deprecated"] = deprecation.get("deprecatedVersion", "")
    row["deprecationNotes"] = deprecation.get("description", "")


def _add_row(
    rows,
    rows_by_path,
    field,
    schema,
    extension_name,
    *,
    infer_required=None,
    inherit_extension=True,
    include_codelist=False,
    include_deprecated=True,
    row=None,
):
    """
    Finish a row for a field, append it to the rows and index it by the field's path.

    :param list rows: the rows collected so far, to which the row is appended
    :param dict rows_by_path: the most recent row for each path, keyed by ``field.path_components``
    :param field: the field that the row describes
    :param dict schema: the JSON schema from which to build the row, if ``row`` isn't provided
    :param str extension_name: the name of the extension to set on the row, if any
    :param bool infer_required: passed to ``_make_row``, if ``row`` isn't provided
    :param bool inherit_extension: whether to fall back to the "extension" value of the parent path's row
    :param bool include_codelist: passed to ``_make_row``, if ``row`` isn't provided
    :param bool include_deprecated: whether to append the row if its "deprecated" value is truthy
    :param dict row: a ready-made row, to use instead of building one from ``schema``
    """
    path_key = field.path_components
    parent_row = rows_by_path.get(path_key[:-1], {})

    if not row:
        row = _make_row(field, schema, infer_required, include_codelist)

    if extension_name:
        row["extension"] = extension_name
    elif inherit_extension and "extension" in parent_row:
        row["extension"] = parent_row["extension"]

    omit = not include_deprecated and row["deprecated"]
    if not omit:
        rows.append(row)

    # The row is indexed even if it was omitted, so that rows for child paths can still inherit its extension.
    rows_by_path[path_key] = row


def _format_enum(values):
    """
    Format the members of an ``enum`` for the "values" column.

    ``None`` members are left out. Strings are used as-is. Any other member (like a number or boolean) is serialized
    as JSON, because ``str.join`` raises a ``TypeError`` on non-string items.

    :param values: the members of the ``enum``
    :returns: "Enum: " followed by the members as a comma-separated list
    :rtype: str
    """
    members = (value if isinstance(value, str) else json.dumps(value) for value in values if value is not None)
    return "Enum: " + ", ".join(members)


def _make_row(field, schema, infer_required, include_codelist):
    """
    Build the row that describes a field, based on a JSON schema.

    The "description" and "links" keys are set only if the schema has a ``description``. The "codelist" key is set
    only if ``include_codelist`` is truthy. The "deprecationNotes" key is set only if the schema is deprecated.

    :param field: the field to describe, whose ``path``, ``path_components``, ``definition``, ``required`` and
                  ``deprecated`` attributes are read
    :param dict schema: the JSON schema from which to read the title, description, type, etc.
    :param bool infer_required: whether to infer that a string or integer field is required if "null" is not in its
                                ``type``
    :param bool include_codelist: whether to set the "codelist" key
    :returns: the row, keyed by column name
    :rtype: dict
    """
    row = {
        "path": field.path,
        "title": schema.get("title", field.path_components[-1] + "*"),
        "deprecated": field.deprecated.get("deprecatedVersion"),  # deprecation from parent
    }

    if len(field.path_components) > 1:
        row["section"] = field.path_components[0]
    else:
        row["section"] = field.definition

    if "description" in schema:
        description = schema["description"]
        # Track (text, URL) pairs instead of a text-to-URL mapping. With a mapping, if two links share the same text
        # but have different URLs, only the last survives: the others are left as Markdown in the description, and
        # their URLs are missing from the links. Repeats of the same pair are still reported once.
        links = list(dict.fromkeys(INLINE_LINK_RE.findall(description)))
        row["description"] = INLINE_LINK_RE.sub(lambda match: match.group(1), description)
        row["links"] = ", ".join(url for _text, url in links)

    required = False
    if "type" in schema:
        declared_types = _cast_as_list(schema["type"])
        # Build a new list instead of removing "null" in place: this drops every occurrence, and it can't modify the
        # schema, whether or not _cast_as_list (defined elsewhere) returns a copy.
        types = [type_ for type_ in declared_types if type_ != "null"]
        nullable = len(types) != len(declared_types)
        if infer_required and not nullable:
            required = "string" in types or "integer" in types
        row["type"] = ", ".join(types)
    else:
        row["type"] = "unknown"
    # An explicitly required field is required, whatever was inferred.
    if field.required:
        required = True

    min_range = "1" if required else "0"
    max_range = "n" if row["type"] == "array" else "1"
    row["range"] = f"{min_range}..{max_range}"

    # Only the first of these keywords is reported, in this order of precedence.
    if "format" in schema:
        row["values"] = schema["format"]
    elif "pattern" in schema:
        row["values"] = "Pattern: " + schema["pattern"]
    elif "enum" in schema:
        row["values"] = _format_enum(schema["enum"])
    elif "items" in schema and "enum" in schema["items"]:
        row["values"] = _format_enum(schema["items"]["enum"])
    else:
        row["values"] = ""

    if include_codelist:
        row["codelist"] = schema.get("codelist")

    # The field's own deprecation overrides the deprecation from its parent.
    _add_deprecated(row, schema)

    return row