Introduction

In this tutorial we will explain:

How to get bulk statistics on WellLog data created
How to fetch WellLog bulk statistics code examples

Note: Computable WellLog's curves data types are: integer, float and date/datetime.

Computable WellLog's curves data types are: integer, float and date/datetime.

Prerequisites

Required Python packages

Before to start to write bulk data through Wellbore DDMS API's, you will need to install the Python packages below:

The pandas module and its Pandas.Dataframe json format to structure log bulk data to be written to the Wellbore DDMS.
The pyarrow module to transform Pandas.Dataframe to parquet file through the pyarrow engine.
The httpx module that allows to post request to the Wellbore DDMS.

# Prerequisite to run this notebook
!python -m pip install pip --upgrade
!pip install pandas numpy httpx pyarrow

Authorization

For any call to Wellbore DDMS API's you need to pass into the header of the request a valid bearer token. This token can be obtained from any API catalog on the developer portal. You will need first to request a developer base subscription. Then from the developer base subscription pick any API and execute it. A valid bearer token is returns in the Curl section of the response. Copy this token value and assign it to the TOKEN variable below.

TOKEN = '' # Paste here the token without the bearer prefix

Utility methods

Helper functions used in the different sample scripts of this tutorial.

from typing import List
import httpx
import pandas as pd
import numpy as np
import io
from IPython.display import display_html, display, HTML
from itertools import chain, cycle

def generate_df_typed(columns, index):
    def gen_values(col_name, size):
        if col_name.startswith('float'):
            return np.random.random_sample(size=size)
        if col_name.startswith('str'):
            return [f'string_value_{i}' for i in range(size)]
        if col_name.startswith('bool'):
            return np.random.choice(a=[False, True], size=size) 
        if col_name.startswith('date'):
            return (np.datetime4('2021-01-01') + days for days in range(size))
        return np.random.randint(-100, 1000, size=size)

    df = pd.DataFrame({c: gen_values(c, len(index))
                      for c in columns}, index=index)
    return df

def multi_table(table_list):
    '''Acceps a list of IpyTable objects and returns a table which contains each IpyTable in a cell'''
    return HTML(
        '<table><tr style="background-color:white;">' + 
        ''.join(['<td>' + table._repr_html_() + '</td>' for table in table_list]) +
        '</tr></table>'
    )

def gen_color(color):
    def fct(val=None):
         return f'color: {color}'
    return fct

def display_operation(before, sent, after):
    colors = ['blue', 'green', 'orange', 'red']
    color_fct = [gen_color(c) for c in colors]
    sent_st = [sent[i].style.set_caption(f'chunk {i+1} sent').applymap(color_fct[i]) for i in range(len(sent))]
    def color_output(s):
        res = []
        for r in s.index:
            c = ''
            for i in range(len(sent)):
                if s.name in sent[i] and int(r) in sent[i][s.name]:
                    c = color_fct[i]()#f'color: {colors[i]}'
            res.append(c)
        return res

    margin = '65'
    after_st = after.style.set_table_attributes(f"style='margin-left:{margin}px'").apply(color_output).highlight_null(null_color='lightyellow').set_caption('Final data - After session commit')   
    display(multi_table([before.style.set_table_attributes(f"style='margin-right:{margin}px'").set_caption('Initial data - Before session'), *sent_st, after_st]))
    
def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions
    """
    output = ""
    combined = dict(zip(captions, dfs))
    for caption, df in combined.items():
        output += df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
        output += "\xa0\xa0\xa0"
    display(HTML(output))
    
def generate_df(columns: List[str], index):
    nbrows = len(index)
    df = pd.DataFrame(
        np.random.randint(-100, 1000, size=(nbrows, len(columns))), index=index)
    df.columns= columns
    return df


def print_response(resp):
    print(f'{resp.request.method} : {resp.url} -> {resp.status_code}')
    if resp.status_code != httpx.codes.OK:
        display(resp.content)

        
def create_df_from_response(response):
    """Returns a dataframe created from the WellLog bulk data response
    Input:
        response: a httpx.response object
    Output:
        dataframe: a pandas.dataframe object
    """
    content_type = response.headers.get('content-type')
    
    if content_type == 'application/json':
        return pd.DataFrame.from_dict(response.json())
    
    elif content_type == 'application/x-parquet':
        f = io.BytesIO(response.content)
        f.seek(0)
        return pd.read_parquet(f)
    
    raise ValueError(f"Unknown content-type: '{content_type}'")
    
def display_previous_and_current_well_log_data_versions(record_id):
    """Display the previous and current WellLog data versions for a given record id and highlight differences between them.
    Input:
        record_id: a WellLog record id
    """
    # list record version
    results_response = client.get(f'{welllog_dms_url}/{record_id}/versions')
    wellLog_versions_response = results_response.json()
    versions = wellLog_versions_response['versions']
    
    is_previous_results = False
    is_current_results = False
    if len(versions) >= 2:
        previous_version_id = versions[len(versions)-2]
        curl = f'{welllog_dms_url}/{record_id}/versions/{previous_version_id}/data'
        results_response = client.get(curl)
        if results_response.status_code == 200:
            previous_results = create_df_from_response(results_response)
            is_previous_results = True
    
        current_version_id = versions[len(versions)-1]
        curl = f'{welllog_dms_url}/{record_id}/versions/{current_version_id}/data'
        results_response = client.get(curl)
        if results_response.status_code == 200:
            current_results = create_df_from_response(results_response)
            is_current_results = True
    
    colors = ['blue', 'red']
    color_fct = [gen_color(c) for c in colors]
    def color_output(s):
        res = []        
        for r in s.index:
            c = ''
            if s.name in previous_results and int(r) in previous_results[s.name]:
                c = color_fct[0]()
            else:
                c = color_fct[1]()
            res.append(c)
        return res
    
    margin = '65'
    tables = []
    if is_previous_results:
        previous_results_st = previous_results.style.set_table_attributes(f"style='margin-left:{margin}px'").highlight_null(null_color='lightyellow').set_caption('Previous WellLog data version').applymap(color_fct[0])  
        tables.append(previous_results_st)
        
    if is_current_results:
        if is_previous_results:
            current_results_st = current_results.style.set_table_attributes(f"style='margin-left:{margin}px'").apply(color_output).highlight_null(null_color='lightyellow').set_caption('Current WellLog data version with data chunks added in red')
            tables.append(current_results_st)
        else:
            current_results_st = current_results.style.set_table_attributes(f"style='margin-left:{margin}px'").highlight_null(null_color='lightyellow').set_caption('Current WellLog data version') 
            tables.append(current_results_st)
        
    display(multi_table(tables))

Settings

Several settings as the base url end-point and the data partition id to create a WellLog to the Wellbore DDMS. Please change those settings accordingly to the environment settings that you want to target.

base_url = "" # set a base URL value
data_partition_id = "" # set a data partition id
legal_tag = "" # set a valid legal tag in the data partition 
acl_domain = "" # set an Access Control Lists (ACL) domain

welllog_dms_url = f'{base_url}/api/os-wellbore-ddms/ddms/v3/welllogs'

client = httpx.Client(verify=False,
    headers={
        "data-partition-id": f"{data_partition_id}",
        "Authorization": f"Bearer {TOKEN}",
    },
    timeout=120
)

# Create a new WellLog. Here is a fake body just to illustrate the API use
record = {
    "kind": "osdu:wks:work-product-component--WellLog:1.0.0",
    "acl": {
        "viewers": [f"data.default.viewers@{data_partition_id}.{acl_domain}"],
        "owners": [f"data.default.owners@{data_partition_id}.{acl_domain}"]
      },
    "legal": {
        "legaltags": [f"{legal_tag}"],
        "otherRelevantDataCountries": ["US"],
    },
    "data": {""
        "WellboreID": "namespace:master-data--Wellbore:SomeUniqueWellboreID:",
        "Curves": [
            {
                "CurveID": "MD",
                "NumberOfColumns": 1
            },
            {
                "CurveID": "X",
                "NumberOfColumns": 1
            }
        ]
    },
}

Create a WellLog record

The script below is creating a WellLog record that is used in this tutorial to demonstrate how to write WellLog bulk data to the Wellbore DDMS.

response = client.post(welllog_dms_url, json=[record])
print_response(response)
record_id = response.json()["recordIds"][0]
record_id

2. How to trigger WellLog bulk data statistics computation

Current limitations to manually trigger the WellLog bulk data statistics computation are:

In case of error, a delay of 1h before triggering the computation again for the same WellLog data.
A maximum of 3 attempts to compute statistics for a given WellLog data is allowed.

Create WellLog record

record = {
    "kind": f"{data_partition}:wks:work-product-component--WellLog:1.0.0",
    "acl": {
        "viewers": [f"data.default.viewers@{data_partition}.{acl_domain}"],
        "owners": [f"data.default.owners@{data_partition}.{acl_domain}"]
      },
    "legal": {
        "legaltags": [legal_tag],
        "otherRelevantDataCountries": ["US"],
    },
    "data": {
        "Curves": [
            {"CurveID": 'int-A'},
            {"CurveID": 'int-A-with-nan'},
            {"CurveID": 'float-B'},
            {"CurveID": 'float-B-with-nan'}, 
            {"CurveID": 'date-C'}, 
            {"CurveID": 'date-C-with-nan'},
            {"CurveID": 'bool-D'},
            {"CurveID": 'string-E'},
        ]
    },
}

response_typed = client.post(welllog_url, json=[record])
print_response(response_typed)
types_col_record_id = response_typed.json()["recordIds"][0]
print(types_col_record_id)

POST : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs -> 200

POST WellLog data

columns = ['int-A', 'int-A-with-nan', 'float-B', 'float-B-with-nan', 'bool-D', 'string-E', 'date-C', 'date-C-with-nan']

different_type_df = generate_df_typed(columns, range(100_000))
add_nan_values_to_df(different_type_df)
display(different_type_df)

data_to_send = different_type_df.to_parquet(engine='pyarrow')

write_response = client.post(f'{welllog_url}/{types_col_record_id}/data', data=data_to_send, headers={'content-type': 'application/parquet'})
print_response(write_response)

	int-A	int-A-with-nan	float-B	float-B-with-nan	bool-D	string-E	date-C	date-C-with-nan
0	868	592.0	0.792575	0.113692	False	string_value_0	2021-01-01 00:00:00.000	2021-01-01 00:00:00.000
1	222	624.0	0.529602	0.047647	True	string_value_1	2021-01-01 00:00:00.001	NaT
2	842	359.0	0.184516	0.783715	True	string_value_2	2021-01-01 00:00:00.002	NaT
3	879	280.0	0.526019	0.288487	False	string_value_3	2021-01-01 00:00:00.003	NaT
4	456	619.0	0.512207	0.373447	True	string_value_4	2021-01-01 00:00:00.004	2021-01-01 00:00:00.004
...	...	...	...	...	...	...	...	...
99995	560	220.0	0.714021	0.064975	False	string_value_99995	2021-01-01 00:01:39.995	2021-01-01 00:01:39.995
99996	861	78.0	0.663497	0.560253	False	string_value_99996	2021-01-01 00:01:39.996	2021-01-01 00:01:39.996
99997	916	100.0	0.687259	NaN	False	string_value_99997	2021-01-01 00:01:39.997	2021-01-01 00:01:39.997
99998	354	933.0	0.563194	0.921421	False	string_value_99998	2021-01-01 00:01:39.998	2021-01-01 00:01:39.998
99999	-14	812.0	0.976996	0.644540	True	string_value_99999	2021-01-01 00:01:39.999	NaT

Note: NaT means Not a Time, equivalent of NaN for dates.

POST : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs/osdu:work-product-component--WellLog:ac6ec4b8074941b19c4723b1dbdc0da9/data -> 200

Trigger bulk statistics computation

Computation can be manually triggered if WellLog was created before M12 release or in case of computation error.

API to use: POST /ddms/v3/welllogs/{record_id}/versions/{version}/data/statistics

NOTE: record id and record version are required.

welllog_meta_response = client.get(f'{welllog_url}/{record_id}')
print_response(welllog_meta_response)
record_version = welllog_meta_response.json()['version']

post_welllog_stats_response = client.post(f'{welllog_url}/{record_id}/versions/{record_version}/data/statistics', headers={'content-type': 'application/parquet'})
print_response(post_welllog_stats_response)

GET : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs/osdu:work-product-component--WellLog:89bd0debbcf1411fb240d0a906da7cd4 -> 200 POST : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs/osdu:work-product-component--WellLog:89bd0debbcf1411fb240d0a906da7cd4/versions/1653990573032433/data/statistics -> 200

3. How to fetch WellLog bulk data statistics already computed

Please, note that WellLog's curves with string and boolean data types are not computed

Display statistics data as JSON

# Leave the parameter "curves" empty will select all the WellLog's curves statistics available
select_all_curves_param = {}

 # To select only curves: 'int-A', 'float-B' and 'date-C' from computed statistics
select_specific_curves_params = {"curves": "int-A,float-B,date-C"}

%time post_welllog_stats_response = client.get(f'{welllog_url}/{types_col_record_id}/data/statistics', param=select_all_curves_param)
print_response(post_welllog_stats_response)

json_posted_stats = post_welllog_stats_response.json()
display(json_posted_stats)

CPU times: total: 141 ms Wall time: 2.01 s GET : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs/osdu:work-product-component--WellLog:ac6ec4b8074941b19c4723b1dbdc0da9/data/statistics -> 200

{'computationStartDatetime': '2022-05-31T09:17:08.431223', 'recordId': 'osdu:work-product-component--WellLog:ac6ec4b8074941b19c4723b1dbdc0da9', 'recordVersion': 1653988624188709, 'computationStatus': 'complete', 'data': {'int-A-with-nan': {'mean': '449.8925764705882', 'std': '317.8241340241287', 'min': '-100.0', '10%': '10.0', '50%': '450.0', '90%': '890.0', 'max': '999.0', 'totalCount': '100000', 'nonAbsentValuesCount': '85000.0'}, 'int-A': {'mean': '449.63618', 'std': '317.49388998642064', 'min': '-100.0', '10%': '9.0', '50%': '451.0', '90%': '890.0', 'max': '999.0', 'totalCount': '100000', 'nonAbsentValuesCount': '100000.0'}, 'date-C': {'mean': '2021-01-01 00:00:49.999499776', 'std': 'NaN', 'min': '2021-01-01 00:00:00', '10%': '2021-01-01 00:00:09.999899904', '50%': '2021-01-01 00:00:49.999500032', '90%': '2021-01-01 00:01:29.999100160', 'max': '2021-01-01 00:01:39.999000', 'totalCount': '100000', 'nonAbsentValuesCount': '100000'}, 'date-C-with-nan': {'mean': '2021-01-01 00:00:49.995083776', 'std': 'NaN', 'min': '2021-01-01 00:00:00', '10%': '2021-01-01 00:00:10.055899904', '50%': '2021-01-01 00:00:49.966500096', '90%': '2021-01-01 00:01:29.970099968', 'max': '2021-01-01 00:01:39.998000', 'totalCount': '100000', 'nonAbsentValuesCount': '85000'}, 'float-B-with-nan': {'mean': '0.4981784333593075', 'std': '0.2888887958106551', 'min': '3.56818820279603e-06', '10%': '0.09984358734025209', '50%': '0.4974533393916296', '90%': '0.8997088186446737', 'max': '0.9999888136429178', 'totalCount': '100000', 'nonAbsentValuesCount': '85000.0'}, 'float-B': {'mean': '0.4980908954221541', 'std': '0.2885563043838143', 'min': '1.4538739944169876e-06', '10%': '0.09894710442338889', '50%': '0.4969224934210345', '90%': '0.8987151676695218', 'max': '0.9999918891377975', 'totalCount': '100000', 'nonAbsentValuesCount': '100000.0'}}}

Display statistics data in a Dataframe

json_posted_stats_copy = response_copy_without_data(post_welllog_stats_response)
display(json_posted_stats_copy)

create_df_from_dict(post_welllog_stats_response)

	mean	std	min	10%	50%	90%	max	totalCount	nonAbsentValuesCount
int-A-with-nan	449.8925764705882	317.8241340241287	-100.0	10.0	450.0	890.0	999.0	100000	85000.0
date-C	2021-01-01 00:00:49.999499776	NaN	2021-01-01 00:00:00	2021-01-01 00:00:09.999899904	2021-01-01 00:00:49.999500032	2021-01-01 00:01:29.999100160	2021-01-01 00:01:39.999000	100000	100000
float-B-with-nan	0.4981784333593075	0.2888887958106551	3.56818820279603e-06	0.09984358734025209	0.4974533393916296	0.8997088186446737	0.9999888136429178	100000	85000.0
float-B	0.4980908954221541	0.2885563043838143	1.4538739944169876e-06	0.09894710442338889	0.4969224934210345	0.8987151676695218	0.9999918891377975	100000	100000.0
int-A	449.63618	317.49388998642064	-100.0	9.0	451.0	890.0	999.0	100000	100000.0
date-C-with-nan	2021-01-01 00:00:49.995083776	NaN	2021-01-01 00:00:00	2021-01-01 00:00:10.055899904	2021-01-01 00:00:49.966500096	2021-01-01 00:01:29.970099968	2021-01-01 00:01:39.998000	100000	85000

4. Fetch WellLog bulk statistics code examples

APIs:

GET /ddms/v3/welllogs/{record_id}/data/statistics
GET /ddms/v3/welllogs/{record_id}/versions/{version}/data/statistics

Select specifics curves statistics

As the GET bulk API, you can fetch statistics only for specifics curves

Fetch WellLog bulk information and retrieve columns from it

GET /ddms/v3/welllogs/{record_id}/data?describe=True

welllog_bulk_response_example_1 = client.get(f'{welllog_url}/{types_col_record_id}/data', params={'describe':True})
record_columns = welllog_bulk_response_example_1.json().get('columns', [])
display(welllog_bulk_response_example_1.json())

wanted_curves = [c for c in record_columns if c.startswith('float')]
print("\nWanted WellLog's curves", wanted_curves)

{'numberOfRows': 100000, 'columns': ['bool-D', 'date-C', 'date-C-with-nan', 'float-B', 'float-B-with-nan', 'int-A', 'int-A-with-nan', 'string-E']}

Wanted WellLog's curves ['float-B', 'float-B-with-nan']

Fetch WellLog bulk statistics of selected WellLog's curves

GET /ddms/v3/welllogs/{record_id}/data/statistics

# Generate list of wanted curves from previous describe=True API.
wanted_curves_params = {
    'curves': ','.join(wanted_curves)
}

welllog_stats_response_example_1 = client.get(f'{welllog_url}/{types_col_record_id}/data/statistics', params=wanted_curves_params)
print_response(welllog_stats_response_example_1)

json_posted_stats_example_1 = welllog_stats_response_example_1.json()
display(json_posted_stats_example_1)

{'curves': 'float-B,float-B-with-nan'} GET : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs/osdu:work-product-component--WellLog:ac6ec4b8074941b19c4723b1dbdc0da9/data/statistics?curves=float-B%2Cfloat-B-with-nan -> 200

Fetch WellLog bulk statistics of anterior WellLog bulk version

List WellLog anterior versions

GET /ddms/v3/welllogs/{welllogId}/versions

welllog_record_response_example_2 = client.get(f'{welllog_url}/{types_col_record_id}/versions')
record_versions = welllog_record_response_example_2.json().get('versions', [])

print("record_versions:", record_versions)
last_record_version = record_versions[-1]

record_versions: [1653988602340449, 1653988624188709]

Then, fetch WellLog bulk statistics at specified version

GET /ddms/v3/welllogs/{record_id}/versions/{version}/data/statistics

wanted_curves_params = {
    'curves': ','.join(wanted_curves)
}

record_version = last_record_version

welllog_stats_response_example_2 = client.get(f'{welllog_url}/{types_col_record_id}/versions/{record_version}/data/statistics', params=wanted_curves_params)
print_response(welllog_stats_response_example_2)

json_posted_stats_example_2 = welllog_stats_response_example_2.json()
display(json_posted_stats_example_2)

GET : https://<ENVIRONMENT_BASE_URL>/api/os-wellbore-ddms/ddms/v3/welllogs/osdu:work-product-component--WellLog:ac6ec4b8074941b19c4723b1dbdc0da9/versions/1653988624188709/data/statistics?curves=float-B%2Cfloat-B-with-nan -> 200