"""
================================
Web Viewer for CMIP6 Data Tables
================================
This module implements a Streamlit web application that provides an
interface for interacting with CMIP6 (Coupled Model Intercomparison
Project Phase 6) data tables.
Launch it from the command line with::
$ pycmor table-explorer
.. image:: images/table-explorer.png
The application allows users to load these tables from three
different sources: GitHub, a local directory, or directly from
the user's laptop. The tables are JSON files containing metadata
about climate model outputs.
The main features of the application are:
1. **Table Source Selection**: Users can select the source of
the tables. The options are 'github', 'Local', and 'Laptop'.
Depending on the selection, the user can provide a URL (for
GitHub), a directory path (for Local), or upload files
(for Laptop).
2. **Table Processing**: The application processes each table,
extracting key information such as table ID, frequency, and
variable entries. Tables that contain no variable entries, or a
variable entry without a frequency, are added to an ignore list.
3. **Variable Selection and Display**: Users can select a variable
from the processed tables. The application then displays all tables
and frequencies where this variable is found, along with additional
information such as the time method (Instantaneous, Climatology, or Mean).
4. **Metrics Display**: The application displays metrics about the
processed tables, including the number of tables, frequencies,
and variables.
5. **Ignored Tables**: The application provides an expander to view
all ignored tables.
The application uses multithreading to load and process tables from
GitHub, improving performance when dealing with a large number of tables.
This module contains several functions:
- **`process_table(tbl_name: str, data: dict)`**: Processes a single table,
extracting key information and
updating global data structures.
- **`show_selected_variable(varname)`**: Displays information about
the selected variable.
- **`load_data_from_github(f, ctx)`**: Loads a single table from GitHub.
The application uses several global data structures to store information
about the tables and variables, including `tbls`, `tbl_raw_data`,
`var_to_tbl`, `frequencies`, `tids`, and `ignored_table_files`.
"""
import json
import os
import socket
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
import requests
import streamlit as st
# Default base URL offered in the "github" source branch (raw file view, `main` branch).
github_url = "https://raw.githubusercontent.com/PCMDI/cmip6-cmor-tables/main/Tables/"

# File names of the known tables.  They are appended to the base URL when
# loading from GitHub and used to recognise a local table directory.
table_files = {
    f"CMIP6_{table_id}.json"
    for table_id in (
        "3hr",
        "6hrLev",
        "6hrPlev",
        "6hrPlevPt",
        "AERday",
        "AERhr",
        "AERmon",
        "AERmonZ",
        "Amon",
        "CF3hr",
        "CFday",
        "CFmon",
        "CFsubhr",
        "CV",
        "E1hr",
        "E1hrClimMon",
        "E3hr",
        "E3hrPt",
        "E6hrZ",
        "Eday",
        "EdayZ",
        "Efx",
        "Emon",
        "EmonZ",
        "Esubhr",
        "Eyr",
        "IfxAnt",
        "IfxGre",
        "ImonAnt",
        "ImonGre",
        "IyrAnt",
        "IyrGre",
        "LImon",
        "Lmon",
        "Oclim",
        "Oday",
        "Odec",
        "Ofx",
        "Omon",
        "Oyr",
        "SIday",
        "SImon",
        "coordinate",
        "day",
        "formula_terms",
        "fx",
        "grids",
        "input_example",
    )
}

# Files the loaders skip up front; ``process_table`` adds more entries at run time.
ignored_table_files = {
    f"CMIP6_{table_id}.json"
    for table_id in (
        "coordinate",
        "grids",
        "input_example",
        "formula_terms",
        "fx",
        "CV",
    )
}

# Shared state filled in while tables are processed.
tbls = defaultdict(list)  # table id -> [(variable name, frequency), ...]
tbl_raw_data = {}  # table file name -> parsed JSON content
var_to_tbl = defaultdict(list)  # variable name -> [(table id, frequency), ...]
frequencies = set()  # every frequency string seen so far
tids = {}  # table id -> table file name
[docs]
def process_table(tbl_name: str, data: dict):
    """Record one table's variables in the module-level lookup structures.

    The table id is read from ``data["Header"]["table_id"]`` with any
    ``"Table "`` prefix removed, and is always registered in ``tids``.
    Every variable entry that carries a frequency is added to
    ``var_to_tbl``, ``tbls`` and ``frequencies``.

    The table file is added to ``ignored_table_files`` when the table id
    is ``"fx"``, when there are no variable entries, or when at least one
    variable entry has no frequency (entries that do have one are still
    recorded).

    Parameters
    ----------
    tbl_name : str
        File name the table was loaded from.
    data : dict
        Parsed JSON content of the table.
    """
    header = data.get("Header", {})
    table_id = header.get("table_id", "").replace("Table ", "")
    tids[table_id] = tbl_name

    entries = data.get("variable_entry")
    if table_id == "fx" or not entries:
        ignored_table_files.add(tbl_name)
        return

    missing_frequency = False
    for var_name, var_attrs in entries.items():
        frequency = var_attrs.get("frequency")
        if not frequency:
            missing_frequency = True
            continue
        var_to_tbl[var_name].append((table_id, frequency))
        tbls[table_id].append((var_name, frequency))
        frequencies.add(frequency)

    if missing_frequency:
        ignored_table_files.add(tbl_name)
[docs]
def show_selected_variable(varname):
    """Render the tables that contain ``varname`` and compare selected ones.

    A selectable dataframe lists every (table, frequency) pair recorded
    for the variable, sorted by table, together with a time method derived
    from the frequency suffix: ``Pt`` -> Instantaneous, ``C``/``CM`` ->
    Climatology, anything else -> Mean.

    For the rows the user selects, the table header and the variable's own
    attributes are merged and shown side by side (one column per selected
    row).  When more than one column is shown, attribute rows whose values
    differ between columns are highlighted.

    Parameters
    ----------
    varname : str
        Name of the variable to display.
    """
    overview_rows = []
    for table_id, freq in var_to_tbl[varname]:
        if freq.endswith("Pt"):
            time_method = "Instantaneous"
        elif freq.endswith(("C", "CM")):
            time_method = "Climatology"
        else:
            time_method = "Mean"
        overview_rows.append(
            {"table": table_id, "frequency": freq, "timemethod": time_method}
        )
    overview_rows.sort(key=lambda row: row["table"])
    overview = pd.DataFrame(overview_rows)

    event = st.dataframe(
        overview,
        on_select="rerun",
        selection_mode=["multi-row"],
        use_container_width=True,
    )
    if not event.selection:
        return

    selected = event.selection["rows"]
    details = []
    for table_id in list(overview.loc[selected].table):
        raw = tbl_raw_data[tids[table_id]]
        # Variable attributes win over header fields of the same name.
        merged = dict(raw["Header"])
        merged.update(raw["variable_entry"][varname])
        details.append(merged)
    if not details:
        return

    # Transposed: attributes become rows, selected tables become columns.
    comparison = pd.DataFrame(details, index=selected).T

    def highlight_differences(row):
        colour = "#eeecf4" if len(row.unique()) > 1 else "white"
        return [f"background-color: {colour}"] * len(row)

    if len(comparison.columns) > 1:
        st.dataframe(
            comparison.style.apply(highlight_differences, axis=1),
            use_container_width=True,
        )
    else:
        st.dataframe(comparison, use_container_width=True)
# Wide layout: a narrow column for the source picker, a wider one for its inputs.
st.set_page_config(layout="wide")
col1, col2 = st.columns([1, 3])

# One caption per radio option, in option order; "Local" shows this host's name.
captions = ["raw githubusercontent", socket.gethostname(), ""]

with col1:
    # index=None: nothing is pre-selected, so table_source starts out as None.
    table_source = st.radio(
        "Select table source",
        ["github", "Local", "Laptop"],
        captions=captions,
        index=None,
    )
# "Laptop": tables are uploaded through the browser.
if table_source == "Laptop":
    with col2:
        tbl_files = st.file_uploader(
            "Tables, select one or more tables:",
            type="json",
            accept_multiple_files=True,
        )
    for f in tbl_files:
        tbl_name = f.name
        if tbl_name in ignored_table_files:
            continue
        # Report an unreadable upload the same way the "Local" branch does,
        # instead of aborting the whole page on the first bad file.
        try:
            data = json.loads(f.read())
        except (json.JSONDecodeError, UnicodeDecodeError):
            st.toast(f"{tbl_name} can not be read.")
            continue
        tbl_raw_data[tbl_name] = data
        process_table(tbl_name, data)
# "github": tables are downloaded over HTTP, several at a time.
if table_source == "github":
    with col2:
        url = st.text_input("Using the following url:", github_url)
        message = (
            "\n"
            "For a different data_spec_version (01.00.32), replace 'main' with "
            "the version '01.00.32' (no quotes) in the url\n"
        )
        st.write(message)

    if url.endswith("json"):
        # The URL already points at a single table file.
        tbl_files = [url]
    else:
        tbl_files = [
            (url.rstrip("/") + "/" + f)
            for f in table_files
            if f not in ignored_table_files
        ]

    def load_data_from_github(f, ctx, timeout=30):
        """Download one table and hand it to ``process_table``.

        Parameters
        ----------
        f : str
            URL of the table's JSON file; its base name is used as table name.
        ctx
            Script-run context obtained in the main thread; it is attached
            to the worker thread running this function.
        timeout : float, optional
            Seconds to wait for the server before giving up, so a stalled
            connection cannot block the page indefinitely.

        Raises
        ------
        requests.RequestException
            If the request fails, times out, or returns an error status.
        json.JSONDecodeError
            If the response body is not valid JSON.
        """
        st.runtime.scriptrunner.add_script_run_ctx(threading.current_thread(), ctx)
        tbl_name = os.path.basename(f)
        if tbl_name in ignored_table_files:
            return
        r = requests.get(f, timeout=timeout)
        r.raise_for_status()
        data = json.loads(r.text)
        tbl_raw_data[tbl_name] = data
        process_table(tbl_name, data)

    with ThreadPoolExecutor(8) as tpool:
        ctx = st.runtime.scriptrunner.get_script_run_ctx()
        futures = [tpool.submit(load_data_from_github, f, ctx) for f in tbl_files]
        for future in as_completed(futures):
            # result() re-raises any exception from the worker thread.
            future.result()
# "Local": tables are read from a path on the machine running the app.
if table_source == "Local":
    with col2:
        srcdir = st.text_input("Table directory:")
    if srcdir:
        srcdir = Path(srcdir).expanduser()
        with col2:
            st.write(srcdir)
        if srcdir.name.endswith("json"):
            # A path whose name ends in "json" is treated as one table file.
            st.write("Loading single file")
            try:
                data = json.loads(srcdir.read_text())
            except json.JSONDecodeError:
                # json.JSONDecodeError is the public name of the decode error;
                # the json module has no ``decode`` attribute.
                st.toast(f"{srcdir.name} can not be read.")
            else:
                tbl_raw_data[srcdir.name] = data
                process_table(srcdir.name, data)
        else:
            files = list(srcdir.glob("*.json"))
            filenames = {f.name for f in files}
            # Non-empty when at least one known table file is present.
            is_valid_path = filenames & table_files
            if not is_valid_path:
                with col2:
                    st.error("No known tables found at this path")
            for f in files:
                tbl_name = f.name
                if tbl_name in ignored_table_files:
                    continue
                try:
                    data = json.loads(f.read_text())
                except json.JSONDecodeError:
                    st.toast(f"{tbl_name} can not be read.")
                else:
                    tbl_raw_data[tbl_name] = data
                    process_table(tbl_name, data)
# Results section: headline metrics, the ignore list, and the variable browser.
if table_source:
st.markdown(
"""
# Tables - Frequencies - Variables
## Metrics
"""
)
# Three side-by-side counters taken from the shared lookup structures.
cols = st.columns(3)
with cols[0]:
st.metric("Tables", len(tbls))
with cols[1]:
st.metric("Frequencies", len(frequencies))
with cols[2]:
st.metric("Variables", len(var_to_tbl))
# Collapsible, sorted list of every table file that was skipped.
with st.expander("Ignored tables"):
st.table(sorted(ignored_table_files))
st.divider()
# Default choice list: every known variable name, sorted.
variables = sorted(var_to_tbl)
# Group variable names by the number of (table, frequency) entries recorded
# for them, then turn each group into a sorted list.
var_references = defaultdict(set)
for vname, items in var_to_tbl.items():
var_references[len(items)].add(vname)
var_references = {
counts: sorted(vnames) for counts, vnames in var_references.items()
}
if var_to_tbl:
st.markdown("## Variables")
# The filter is only offered when there is more than one reference count.
if var_references and len(var_references) > 1:
filtered_variables = st.checkbox(
"Filter variable list by number of references to tables"
)
if filtered_variables:
counts = st.select_slider(
"Number of references", options=sorted(var_references)
)
# Narrow the choice list to variables with the chosen reference count.
variables = var_references[counts]
# index=None: no variable is pre-selected.
varname = st.selectbox(
f"Select Variable (count: {len(variables)})", variables, index=None
)
if varname:
show_selected_variable(varname)