Generating Metadata (UDR)

This page provides an overview of APIs related to generating metadata.

View natively generates metadata from source data assets in the form of Universal Data Representation (UDR). UDR is used as the foundational data representation by Lexi, enabling both simple and advanced search capabilities.

Metadata within UDR documents includes:

  • The list of key terms identified within the data asset and their frequency
  • The full list of terms identified within the data asset
  • The inferred schema, when the supplied content type implies a schema is present
  • When a schema is present, a flattened representation of the document to simplify queries
  • The postings, i.e. an inverted index over the document, including terms, frequencies, and absolute and relative positions of each

Within the processing pipeline, semantic cell extraction is handled as a separate process, and the results are appended to UDR documents.

To generate a UDR document for a given data asset, you must first determine its data type through type detection. Then, send a POST request to /v1.0/tenants/[tenant-guid]/processing/udr on the document processor server, which by default listens on port 8000.

{
    "GUID": "00000000-0000-0000-0000-000000000000",
    "Key": "testfile.text",
    "ContentType": "text/plain",
    "Type": "Text",
    "IncludeFlattened": true,
    "CaseInsensitive": true,
    "TopTerms": 10,
    "AdditionalData": "The body below is simple sample text, base64 encoded, taken from https://en.wikipedia.org/wiki/Artificial_intelligence.",
    "Metadata": {
        "foo": "bar"
    },
    "MetadataRule": {
        "GUID": "00000000-0000-0000-0000-000000000000",
        "TenantGUID": "00000000-0000-0000-0000-000000000000",
        "BucketGUID": "00000000-0000-0000-0000-000000000000",
        "OwnerGUID": "00000000-0000-0000-0000-000000000000",
        "Name": "My metadata rule",
        "ContentType": "text/plain",
        "UdrEndpoint": "http://localhost:8000/",
        "DataCatalogType": "Lexi",
        "DataCatalogEndpoint": "http://localhost:8000/",
        "DataCatalogCollection": "00000000-0000-0000-0000-000000000000",
        "TopTerms": 10,
        "CaseInsensitive": true,
        "IncludeFlattened": true
    },
    "Data": "QXJ0aWZpY2..."
}
# POST a UDR generation request for a base64-encoded plain-text asset.
# -L follows redirects, -H adds a request header, -d supplies the JSON body.
curl -L 'http://view.homedns.org:8000/v1.0/tenants/00000000-0000-0000-0000-000000000000/processing/udr' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: ••••••' \
  -d '{
    "GUID": "00000000-0000-0000-0000-000000000000",
    "Key": "testfile.text",
    "ContentType": "text/plain",
    "Type": "Text",
    "IncludeFlattened": true,
    "CaseInsensitive": true,
    "TopTerms": 10,
    "AdditionalData": "The body below is simple sample text, base64 encoded, taken from https://en.wikipedia.org/wiki/Artificial_intelligence.",
    "Metadata": {
        "foo": "bar"
    },
    "MetadataRule": {
        "GUID": "00000000-0000-0000-0000-000000000000",
        "TenantGUID": "00000000-0000-0000-0000-000000000000",
        "BucketGUID": "00000000-0000-0000-0000-000000000000",
        "OwnerGUID": "00000000-0000-0000-0000-000000000000",
        "Name": "My metadata rule",
        "ContentType": "text/plain",
        "UdrEndpoint": "http://localhost:8000/",
        "DataCatalogType": "Lexi",
        "DataCatalogEndpoint": "http://localhost:8000/",
        "DataCatalogCollection": "00000000-0000-0000-0000-000000000000",
        "TopTerms": 10,
        "CaseInsensitive": true,
        "IncludeFlattened": true
    },
    "Data": "QXJ0aWZpY2..."
}'
import { ViewProcessorSdk } from "view-sdk";

// Tenant Id, access token, and endpoint handed to the SDK client below.
const tenantGuid = "00000000-0000-0000-0000-000000000000";
const accessToken = "default";
const endpoint = "http://localhost:8000/";

// Client used by the example to submit the UDR generation request.
const processor = new ViewProcessorSdk(tenantGuid, accessToken, endpoint);

/**
 * Submits a sample UDR generation request through `processor.generateUdr`
 * and logs the response. Any error raised while sending the request or
 * logging the response is caught and logged with an "Error" prefix, so the
 * returned promise always resolves to `undefined`.
 *
 * @returns {Promise<void>}
 */
const generateUDR = async () => {
  // Metadata rule embedded in the request.
  const metadataRule = {
    GUID: "00000000-0000-0000-0000-000000000000",
    TenantGUID: "00000000-0000-0000-0000-000000000000",
    BucketGUID: "00000000-0000-0000-0000-000000000000",
    OwnerGUID: "00000000-0000-0000-0000-000000000000",
    Name: "My metadata rule",
    ContentType: "text/plain",
    UdrEndpoint: "http://localhost:8000/",
    DataCatalogType: "Lexi",
    DataCatalogEndpoint: "http://localhost:8000/",
    DataCatalogCollection: "00000000-0000-0000-0000-000000000000",
    TopTerms: 10,
    CaseInsensitive: true,
    IncludeFlattened: true,
  };

  // Request body; `Data` carries the (truncated) base64 sample described by `AdditionalData`.
  const udrRequest = {
    GUID: "00000000-0000-0000-0000-000000000000",
    Key: "testfile.text",
    ContentType: "text/plain",
    Type: "Text",
    IncludeFlattened: true,
    CaseInsensitive: true,
    TopTerms: 10,
    AdditionalData:
      "The body below is simple sample text, base64 encoded, taken from https://en.wikipedia.org/wiki/Artificial_intelligence.",
    Metadata: { foo: "bar" },
    MetadataRule: metadataRule,
    Data: "QXJ0aWZpY2lh...",
  };

  try {
    const udrDocument = await processor.generateUdr(udrRequest);
    console.log(udrDocument);
  } catch (failure) {
    console.log("Error", failure);
  }
};
// Run the example. The returned promise is not awaited; failures are logged by the catch block inside generateUDR.
generateUDR();
import view_sdk
from view_sdk import processor

# Configure the SDK with the access key, base URL, and tenant GUID.
sdk = view_sdk.configure(
    access_key="default",
    base_url="localhost",
    tenant_guid="00000000-0000-0000-0000-000000000000",
)

def udrGeneration():
    """Request UDR generation for a sample text asset and print the result.

    Builds the request fields, passes them as keyword arguments to
    ``processor.UdrGenerator.generate``, and prints whatever it returns.
    """
    # Metadata rule embedded in the request.
    metadata_rule = {
        "GUID": "00000000-0000-0000-0000-000000000000",
        "TenantGUID": "00000000-0000-0000-0000-000000000000",
        "BucketGUID": "00000000-0000-0000-0000-000000000000",
        "OwnerGUID": "00000000-0000-0000-0000-000000000000",
        "Name": "My metadata rule",
        "ContentType": "text/plain",
        "UdrEndpoint": "http://localhost:8000/",
        "DataCatalogType": "Lexi",
        "DataCatalogEndpoint": "http://localhost:8000/",
        "DataCatalogCollection": "00000000-0000-0000-0000-000000000000",
        "TopTerms": 10,
        "CaseInsensitive": True,
        "IncludeFlattened": True,
    }

    # Keyword arguments for the generator; "Data" carries the (truncated)
    # base64 sample described by "AdditionalData".
    request_fields = {
        "GUID": "00000000-0000-0000-0000-000000000000",
        "Key": "testfile.text",
        "ContentType": "text/plain",
        "Type": "Text",
        "IncludeFlattened": True,
        "CaseInsensitive": True,
        "TopTerms": 10,
        "AdditionalData": "The body below is simple sample text, base64 encoded, taken from https://en.wikipedia.org/wiki/Artificial_intelligence.",
        "Metadata": {"foo": "bar"},
        "MetadataRule": metadata_rule,
        "Data": "QXJ0aWZpY2lhbCBp...",
    }

    print(processor.UdrGenerator.generate(**request_fields))

# Run the example.
udrGeneration()

The response body will contain a fully-populated UDR document.

{
    "GUID": "00000000-0000-0000-0000-000000000000",
    "Success": true,
    "Timestamp": {
        "Start": "2025-04-30T12:54:18.561659Z",
        "End": "2025-04-30T12:54:18.618885Z",
        "TotalMs": 57.23,
        "Messages": {}
    },
    "AdditionalData": "The body below is simple sample text, base64 encoded, taken from https://en.wikipedia.org/wiki/Artificial_intelligence.",
    "Metadata": {
        "foo": "bar"
    },
    "Key": "testfile.text",
    "Type": "Text",
    "Terms": [
        "Artificial",
        "intelligence",
        "broadest"
    ],
    "TopTerms": {
        "intelligence": 3,
        "machines": 3,
        "applications": 3
    },
    "Schema": {
        "Type": "Text",
        "Schema": {},
        "Metadata": {},
        "Flattened": []
    },
    "Postings": [
        {
            "Term": "Artificial",
            "Count": 1,
            "AbsolutePositions": [
                0
            ]
        },
        {
            "Term": "anymore",
            "Count": 1,
            "AbsolutePositions": [
                96
            ]
        }
    ],
    "SemanticCells": []
}