分层数据#
Bokeh 没有任何专门用于处理分层数据的内置 API,但可以将 Bokeh 的基本组件与其他库结合使用以处理许多情况。下面描述了一些示例。
树状图#
树状图提供了一种查看分层数据的方式,有助于突出显示模式,例如销售数据中最大或最小的销售商。树分支由矩形表示,子分支由更小的嵌套矩形表示。
下面的示例演示了如何结合使用 Bokeh 的 block() 函数和第三方库 Squarify 来创建树状图。
import pandas as pd
from squarify import normalize_sizes, squarify
from bokeh.plotting import figure, show
from bokeh.sampledata.sample_superstore import data
from bokeh.transform import factor_cmap
# Keep only the columns used by the treemap.
data = data[["City", "Region", "Sales"]]

# Region order used as the factor list for the color mappings further down.
regions = ("West", "Central", "South", "East")

# Total sales per (Region, City), ascending by sales.
# The "Sales" column is selected explicitly: ``.sum("Sales")`` would hand the
# string to the positional ``numeric_only`` flag instead of naming a column.
sales_by_city = data.groupby(["Region", "City"])[["Sales"]].sum()
sales_by_city = sales_by_city.sort_values(by="Sales").reset_index()

# Total sales per Region, ascending by sales (index is "Region").
sales_by_region = sales_by_city.groupby("Region")[["Sales"]].sum().sort_values(by="Sales")
def treemap(df, col, x, y, dx, dy, *, N=100):
    """Lay out the ``N`` largest rows of ``df`` as treemap rectangles.

    Args:
        df: DataFrame holding the values to lay out.
        col: Name of the column whose values determine rectangle sizes.
        x: Origin x passed to ``squarify``.
        y: Origin y passed to ``squarify``.
        dx: Width passed to ``normalize_sizes`` and ``squarify``.
        dy: Height passed to ``normalize_sizes`` and ``squarify``.
        N: Maximum number of rows (largest by ``col``) to keep.

    Returns:
        The selected rows joined with the rectangle columns produced by
        ``squarify``, with the original index moved into a regular column.
    """
    # nlargest both truncates to N rows and orders them descending by ``col``.
    sub_df = df.nlargest(N, col)
    normed = normalize_sizes(sub_df[col], dx, dy)
    blocks = squarify(normed, x, y, dx, dy)
    # Re-use the selected rows' index so the join lines rectangles up with rows.
    blocks_df = pd.DataFrame.from_dict(blocks).set_index(sub_df.index)
    return sub_df.join(blocks_df, how='left').reset_index()
# Overall treemap rectangle; w and h are also used as the figure size.
x, y, w, h = 0, 0, 800, 450

# Top level: one rectangle per region, sized by total regional sales.
blocks_by_region = treemap(sales_by_region, "Sales", x, y, w, h)

# Second level: inside each region's rectangle, lay out its 10 largest cities.
# Row values are read by column name so the canvas origin (x, y) above is not
# rebound by the loop.
dfs = []
for _, region_block in blocks_by_region.iterrows():
    df = sales_by_city[sales_by_city.Region == region_block["Region"]]
    dfs.append(treemap(df, "Sales", region_block["x"], region_block["y"],
                       region_block["dx"], region_block["dy"], N=10))
blocks = pd.concat(dfs)

p = figure(width=w, height=h, tooltips="@City", toolbar_location=None,
           x_axis_location=None, y_axis_location=None)
p.x_range.range_padding = p.y_range.range_padding = 0
p.grid.grid_line_color = None

# City rectangles, colored by the region they belong to.
p.block('x', 'y', 'dx', 'dy', source=blocks, line_width=1, line_color="white",
        fill_alpha=0.8, fill_color=factor_cmap("Region", "MediumContrast4", regions))

# Region names anchored at each region rectangle's (x, y) corner.
p.text('x', 'y', x_offset=2, text="Region", source=blocks_by_region,
       text_font_size="18pt", text_color="white")

# City names hang from the top edge (y + dy) of each city rectangle.
blocks["ytop"] = blocks.y + blocks.dy
p.text('x', 'ytop', x_offset=2, y_offset=2, text="City", source=blocks,
       text_font_size="6pt", text_baseline="top",
       text_color=factor_cmap("Region", ("black", "white", "black", "white"), regions))

show(p)
交叉表#
交叉表(即“crosstabs”)同样可以展示各部分与整体之间以及各部分彼此之间的关系。下面的示例展示了基于示例超市(Superstore)数据的交叉表绘制的相邻条形图。由于使用了更多的样式设置和内嵌标签,此示例相对更复杂一些。
import pandas as pd
from bokeh.core.properties import value
from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.sampledata.sample_superstore import data as df
from bokeh.transform import cumsum, factor_cmap
# Share of each category's sales per region (every category row sums to 1).
rows = pd.crosstab(
    df.Category,
    df.Region,
    aggfunc='sum',
    values=df.Sales,
    normalize="index",
)

# Transposed so that each category becomes a column and each region a row.
source = ColumnDataSource(rows.transpose())
regions = source.data["Region"]

# Per-region label colors, stored alongside the data.
source.data["color"] = ["#dadada", "#dadada", "#4a4a4a", "#dadada"]

cats = ["Office Supplies", "Furniture", "Technology"]

p = figure(
    y_range=cats,
    x_range=(-0.55, 1.02),
    height=400,
    width=700,
    tools="",
    x_axis_location=None,
    toolbar_location=None,
    outline_line_color=None,
)

# Strip the plot down to a single thick y-axis line pinned at x = 0.
p.grid.grid_line_color = None
p.yaxis.fixed_location = 0
p.axis.major_tick_line_color = None
p.axis.major_label_text_color = None
p.axis.axis_line_color = "#4a4a4a"
p.axis.axis_line_width = 6
# One stacked horizontal bar per category, split by region.
for cat in cats:
    # Cumulative sums of the category column give each segment's extent:
    # ``include_zero=True`` yields the start edge, the plain sum the end edge.
    left, right = cumsum(cat, include_zero=True), cumsum(cat)

    p.hbar(y=value(cat), left=left, right=right, source=source, height=0.9,
           color=factor_cmap("Region", "MediumContrast4", regions))

    # Inline label per segment: region name plus its percentage share.
    pcts = source.data[cat]
    source.data[f"{cat} text"] = [f"{r}\n{x*100:0.1f}%" for r, x in zip(regions, pcts)]

    p.text(y=value(cat), x=left, text=f"{cat} text", source=source, x_offset=10,
           text_color="color", text_baseline="middle", text_font_size="15px")
# Column-normalized crosstab with margins; the "All" column holds each
# category's value for the margin across regions.
normalized = pd.crosstab(
    df.Category,
    df.Region,
    margins=True,
    aggfunc='sum',
    values=df.Sales,
    normalize="columns",
)
totals = normalized["All"]

# Bars for the totals extend to the left of the axis (negative x).
p.hbar(right=0, left=-totals, y=totals.index, height=0.9, color="#dadada")

# Right-aligned category labels with their total percentage.
text = []
for name in cats:
    text.append(f"{name} ({totals.loc[name]*100:0.1f}%)")

p.text(
    y=cats,
    x=0,
    text=text,
    text_baseline="middle",
    text_align="right",
    x_offset=-12,
    text_color="#4a4a4a",
    text_font_size="20px",
    text_font_style="bold",
)

show(p)
数据立方体#
后续章节将更详细地介绍交互和小部件,但这里值得一提的是,Bokeh 确实提供了一个专门用于呈现分层数据视图的小部件。下面是使用 DataCube 的一个简单示例。
from bokeh.io import show
from bokeh.models import (ColumnDataSource, DataCube, GroupingInfo,
StringFormatter, SumAggregator, TableColumn)
# Flat table: d0 and d1 are the grouping levels, d2 is the displayed name,
# px is the value that gets summed per group.
source = ColumnDataSource(data={
    "d0": ['A', 'E', 'E', 'E', 'J', 'L', 'M'],
    "d1": ['B', 'D', 'D', 'H', 'K', 'L', 'N'],
    "d2": ['C', 'F', 'G', 'H', 'K', 'L', 'O'],
    "px": [10, 20, 30, 40, 50, 60, 70],
})

# Initially empty data source handed to the cube as its ``target``.
target = ColumnDataSource(data={"row_indices": [], "labels": []})

formatter = StringFormatter(font_style='bold')

name_column = TableColumn(field='d2', title='Name', width=80, sortable=False,
                          formatter=formatter)
price_column = TableColumn(field='px', title='Price', width=40, sortable=False)
columns = [name_column, price_column]

# Group first by d0, then by d1; each level sums the px column.
grouping = [
    GroupingInfo(getter=level, aggregators=[SumAggregator(field_='px')])
    for level in ('d0', 'd1')
]

cube = DataCube(source=source, columns=columns, grouping=grouping, target=target)

show(cube)