分层数据#

Bokeh 没有任何专门用于处理分层数据的内置 API,但可以将 Bokeh 的基本组件与其他库结合使用以处理许多情况。下面描述了一些示例。

树状图#

树状图提供了一种查看分层数据的方式,有助于突出显示模式,例如销售数据中最大或最小的销售商。树分支由矩形表示,子分支由更小的嵌套矩形表示。

下面的示例演示了如何使用 Bokeh 的 block() 函数以及第三方 Squarify 库创建树状图。

import pandas as pd
from squarify import normalize_sizes, squarify

from bokeh.plotting import figure, show
from bokeh.sampledata.sample_superstore import data
from bokeh.transform import factor_cmap

data = data[["City", "Region", "Sales"]]

regions = ("West", "Central", "South", "East")

sales_by_city = data.groupby(["Region", "City"]).sum("Sales")
sales_by_city = sales_by_city.sort_values(by="Sales").reset_index()

sales_by_region = sales_by_city.groupby("Region").sum("Sales").sort_values(by="Sales")

def treemap(df, col, x, y, dx, dy, *, N=100):
    sub_df = df.nlargest(N, col)
    normed = normalize_sizes(sub_df[col], dx, dy)
    blocks = squarify(normed, x, y, dx, dy)
    blocks_df = pd.DataFrame.from_dict(blocks).set_index(sub_df.index)
    return sub_df.join(blocks_df, how='left').reset_index()

x, y, w, h = 0, 0, 800, 450

blocks_by_region = treemap(sales_by_region, "Sales", x, y, w, h)

dfs = []
for index, (Region, Sales, x, y, dx, dy) in blocks_by_region.iterrows():
    df = sales_by_city[sales_by_city.Region==Region]
    dfs.append(treemap(df, "Sales", x, y, dx, dy, N=10))
blocks = pd.concat(dfs)

p = figure(width=w, height=h, tooltips="@City", toolbar_location=None,
           x_axis_location=None, y_axis_location=None)
p.x_range.range_padding = p.y_range.range_padding = 0
p.grid.grid_line_color = None

p.block('x', 'y', 'dx', 'dy', source=blocks, line_width=1, line_color="white",
        fill_alpha=0.8, fill_color=factor_cmap("Region", "MediumContrast4", regions))

p.text('x', 'y', x_offset=2, text="Region", source=blocks_by_region,
       text_font_size="18pt", text_color="white")

blocks["ytop"] = blocks.y + blocks.dy
p.text('x', 'ytop', x_offset=2, y_offset=2, text="City", source=blocks,
       text_font_size="6pt", text_baseline="top",
       text_color=factor_cmap("Region", ("black", "white", "black", "white"), regions))

show(p)

交叉表#

交叉表(即“crosstabs”)也显示了整体各个部分之间以及彼此之间的关系。下面的示例显示了应用于样本超级商店数据交叉表的相邻条形图。由于更广泛的样式和内联标签,此示例更复杂。

import pandas as pd

from bokeh.core.properties import value
from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.sampledata.sample_superstore import data as df
from bokeh.transform import cumsum, factor_cmap

rows = pd.crosstab(df.Category, df.Region, aggfunc='sum', values=df.Sales, normalize="index")

source = ColumnDataSource(rows.T)

cats = ["Office Supplies", "Furniture", "Technology"]
regions = source.data["Region"]

p = figure(y_range=cats, x_range=(-0.55, 1.02), height=400, width=700, tools="",
           x_axis_location=None, toolbar_location=None, outline_line_color=None)
p.grid.grid_line_color = None
p.yaxis.fixed_location = 0
p.axis.major_tick_line_color = None
p.axis.major_label_text_color = None
p.axis.axis_line_color = "#4a4a4a"
p.axis.axis_line_width = 6

source.data["color"] = [ "#dadada","#dadada", "#4a4a4a", "#dadada"]
for y in cats:
    left, right = cumsum(y, include_zero=True), cumsum(y)

    p.hbar(y=value(y), left=left, right=right, source=source, height=0.9,
           color=factor_cmap("Region", "MediumContrast4", regions))

    pcts = source.data[y]
    source.data[f"{y} text"] = [f"{r}\n{x*100:0.1f}%" for r, x in zip(regions, pcts)]

    p.text(y=value(y), x=left, text=f"{y} text", source=source, x_offset=10,
           text_color="color", text_baseline="middle", text_font_size="15px")

totals = pd.crosstab(df.Category, df.Region, margins=True, aggfunc='sum',
                     values=df.Sales, normalize="columns").All

p.hbar(right=0, left=-totals, y=totals.index, height=0.9, color="#dadada")

text = [f"{name} ({totals.loc[name]*100:0.1f}%)" for name in cats]
p.text(y=cats, x=0, text=text, text_baseline="middle", text_align="right",
       x_offset=-12, text_color="#4a4a4a", text_font_size="20px",
       text_font_style="bold")

show(p)

数据立方体#

以后的章节将更详细地介绍交互和部件,但这里值得一提的是,Bokeh 确实有一个专门用于呈现分层数据视图的部件。下面显示了使用 DataCube 的简单示例。

from bokeh.io import show
from bokeh.models import (ColumnDataSource, DataCube, GroupingInfo,
                          StringFormatter, SumAggregator, TableColumn)

source = ColumnDataSource(data=dict(
    d0=['A', 'E', 'E', 'E', 'J', 'L', 'M'],
    d1=['B', 'D', 'D', 'H', 'K', 'L', 'N'],
    d2=['C', 'F', 'G', 'H', 'K', 'L', 'O'],
    px=[10, 20, 30, 40, 50, 60, 70],
))

target = ColumnDataSource(data=dict(row_indices=[], labels=[]))

formatter = StringFormatter(font_style='bold')

columns = [
    TableColumn(field='d2', title='Name', width=80, sortable=False, formatter=formatter),
    TableColumn(field='px', title='Price', width=40, sortable=False),
]

grouping = [
    GroupingInfo(getter='d0', aggregators=[SumAggregator(field_='px')]),
    GroupingInfo(getter='d1', aggregators=[SumAggregator(field_='px')]),
]

cube = DataCube(source=source, columns=columns, grouping=grouping, target=target)

show(cube)