接上节,依然使用上节的用例如下
postgres=# select * from bt_metap('pk_t_index');
magic | version | root | level | fastroot | fastlevel | oldest_xact | last_cleanup_num_tuples
--------+---------+------+-------+----------+-----------+-------------+-------------------------
340322 | 4 | 1 | 0 | 1 | 0 | 0 | -1
(1 row)
postgres=#
可以看到root page id = 1 。
postgres=# select * from bt_page_stats('pk_t_index',1);
blkno | type | live_items | dead_items | avg_item_size | page_size | free_size | btpo_prev | btpo_next | btpo | btpo_flags
-------+------+------------+------------+---------------+-----------+-----------+-----------+-----------+------+------------
1 | l | 4 | 0 | 16 | 8192 | 8068 | 0 | 0 | 0 | 3
(1 row)
postgres=# select * from bt_page_items('pk_t_index',1);
itemoffset | ctid | itemlen | nulls | vars | data
------------+-------+---------+-------+------+-------------------------
1 | (0,1) | 16 | f | f | 02 00 00 00 00 00 00 00
2 | (0,2) | 16 | f | f | 04 00 00 00 00 00 00 00
3 | (0,3) | 16 | f | f | 08 00 00 00 00 00 00 00
4 | (0,4) | 16 | f | f | 10 00 00 00 00 00 00 00
(4 rows)
postgres=# select * from heap_page_items(get_raw_page('pk_t_index',1));
lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid | t_data
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+--------
1 | 8160 | 1 | 16 | | | | | | | | | |
2 | 8144 | 1 | 16 | | | | | | | | | |
3 | 8128 | 1 | 16 | | | | | | | | | |
4 | 8112 | 1 | 16 | | | | | | | | | |
(4 rows)
postgres=#
postgres=# select * from page_header(get_raw_page('pk_t_index',1));
lsn | checksum | flags | lower | upper | special | pagesize | version | prune_xid
-----------+----------+-------+-------+-------+---------+----------+---------+-----------
0/4D402D0 | 0 | 0 | 40 | 8112 | 8176 | 8192 | 4 | 0
(1 row)
根据root page id = 1查看root page的statsbtpo=0 说明已经到了最底层btpo_flags=3,说明它既是leaf又是root页。btpo_prev和btpo_next分别表示该页的相邻页(branch page是双向链表)。btpo_flags 可以在代码中查看(src/include/access/nbtree.h),一共有几个
typedef struct BTPageOpaqueData
{
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
union
{
uint32 level; /* tree level --- zero for leaf pages */
TransactionId xact; /* next transaction ID, if deleted */
} btpo;
uint16 btpo_flags; /* flag bits, see below */
BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */
} BTPageOpaqueData;
/* Bits defined in btpo_flags */
#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
#define BTP_META (1 << 3) /* meta-page */
#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples */
#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
BTPageOpaqueData存储在page 1(这里的page 1又是根节点,又是叶节点,还是子节点,所以不好说,是否每个节点页都还包含这个数据结构,在下面的实验中会体现)sepcial空间,如下page1相对整个页面的偏移为8192,而page 1的special空间相对页头偏移为8176,所以相对这个文件偏移为16368
postgres@morro:~$ hexdump -C /var/lib/postgresql/data/pg12/base/12711/46918 -s 16368 -n 16 00003ff0 00 00 00 00 00 00 00 00 00 00 00 00 03 00 00 00 |................| 00004000 postgres@morro:~$
BTPageOpaqueData大小正好为16字节,这里不一一查看每个域的值了,可以看到其他均为0,flags为3.下面查看一下,bt元组的items 先插入两条新元组,先插入id为18,再插入id为17
postgres=# insert into t_index values(18,'4','d');
INSERT 0 1
postgres=# insert into t_index values(17,'4','d');
INSERT 0 1
postgres=#
postgres=# select * from bt_page_items('pk_t_index',1);
itemoffset | ctid | itemlen | nulls | vars | data
------------+-------+---------+-------+------+-------------------------
1 | (0,1) | 16 | f | f | 02 00 00 00 00 00 00 00
2 | (0,2) | 16 | f | f | 04 00 00 00 00 00 00 00
3 | (0,3) | 16 | f | f | 08 00 00 00 00 00 00 00
4 | (0,4) | 16 | f | f | 10 00 00 00 00 00 00 00
5 | (0,6) | 16 | f | f | 11 00 00 00 00 00 00 00
6 | (0,5) | 16 | f | f | 12 00 00 00 00 00 00 00
(6 rows)
postgres=#
可以看到,实际数据存储时是无序的(索性必须是有序的),但是索引的行指针是有序的18对应的ctid为(0,5),17对应的ctid为(0,6),索引17存在18后面,所以是无序的,但是(0,5)也就是18对应的行指针为lp6,(0,6)对应的行指针为lp5,所以行指针是有序的,正确的表示了索引值的大小。通过这个例子,也就是说如果我们插入一个15的,15对应的ctid应该为(0,7)但是对应的行指针应该为lp4,10,17,18对应的行指针应该要后移一步。试一下
postgres=# select * from bt_page_items('pk_t_index',1);
itemoffset | ctid | itemlen | nulls | vars | data
------------+-------+---------+-------+------+-------------------------
1 | (0,1) | 16 | f | f | 02 00 00 00 00 00 00 00
2 | (0,2) | 16 | f | f | 04 00 00 00 00 00 00 00
3 | (0,3) | 16 | f | f | 08 00 00 00 00 00 00 00
4 | (0,7) | 16 | f | f | 0f 00 00 00 00 00 00 00
5 | (0,4) | 16 | f | f | 10 00 00 00 00 00 00 00
6 | (0,6) | 16 | f | f | 11 00 00 00 00 00 00 00
7 | (0,5) | 16 | f | f | 12 00 00 00 00 00 00 00
(7 rows)
postgres=#
结果和我们想的一样。我们再来解析一条itembt元组的头信息如下,长度为8个字节
typedef struct IndexTupleData
{
ItemPointerData t_tid; /* reference TID to heap tuple */
/* ---------------
* t_info is laid out in the following fashion:
*
* //该元组是否为null
* 15th (high) bit: has nulls
* //该元组是否有可变长度的属性
* 14th bit: has var-width attributes
* //access method 定义其意义
* 13th bit: AM-defined meaning
* //元组的大小
* 12-0 bit: size of tuple
* ---------------
*/
unsigned short t_info; /* various info about tuple */
} IndexTupleData;
typedef struct ItemPointerData
{
BlockIdData ip_blkid; //块号
OffsetNumber ip_posid; //块内偏移
}
typedef struct BlockIdData
{
uint16 bi_hi;
uint16 bi_lo;
} BlockIdData;
查看所有bt元组在页内的偏移
postgres=# select * from heap_page_items(get_raw_page('pk_t_index',1));
lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid | t_data
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+--------
1 | 8160 | 1 | 16 | | | | | | | | | |
2 | 8144 | 1 | 16 | | | | | | | | | |
3 | 8128 | 1 | 16 | | | | | | | | | |
4 | 8064 | 1 | 16 | | | | | | | | | |
5 | 8112 | 1 | 16 | | | | | | | | | |
6 | 8080 | 1 | 16 | | | | | | | | | |
7 | 8096 | 1 | 16 | | | | | | | | | |
(7 rows)
postgres=#
以第一条元组为例,相对于文件头,该元组的偏移为8192+8160,长度为16字节,其中8字节为头,Tcid
postgres@morro:~$ hexdump -C /var/lib/postgresql/data/pg12/base/12711/46918 -s 16352 -n 4 00003fe0 00 00 00 00 |....| 00003fe4 postgres@morro:~$
对应(0, 1)中的0,注意这里的t-cid指向的是堆表的元组位置,在本例中是t_index的page 0 的 1号元组。
postgres@morro:~$ hexdump -C /var/lib/postgresql/data/pg12/base/12711/46918 -s 16356 -n 2 00003fe4 01 00 |..| 00003fe6 postgres@morro:~$
1,对应(0,1)中的1.t_Info
postgres@morro:~$ hexdump -C /var/lib/postgresql/data/pg12/base/12711/46918 -s 16358 -n 2 00003fe6 10 00 |..| 00003fe8 postgres@morro:~$
0X0010,根据上面的解释,表示元组长度为16元组的信息为
postgres=# select * from bt_page_items('pk_t_index',1);
itemoffset | ctid | itemlen | nulls | vars | data
------------+-------+---------+-------+------+-------------------------
1 | (0,1) | 16 | f | f | 02 00 00 00 00 00 00 00
2 | (0,2) | 16 | f | f | 04 00 00 00 00 00 00 00
3 | (0,3) | 16 | f | f | 08 00 00 00 00 00 00 00
4 | (0,7) | 16 | f | f | 0f 00 00 00 00 00 00 00
5 | (0,4) | 16 | f | f | 10 00 00 00 00 00 00 00
6 | (0,6) | 16 | f | f | 11 00 00 00 00 00 00 00
7 | (0,5) | 16 | f | f | 12 00 00 00 00 00 00 00
(7 rows)
postgres=#
查看第一条元组的数据
postgres@morro:~$ hexdump -C /var/lib/postgresql/data/pg12/base/12711/46918 -s 16360 -n 8 00003fe8 02 00 00 00 00 00 00 00 |........| 00003ff0 postgres@morro:~$
根据源码注释如下
/*
* Index tuple header structure
*
* All index tuples start with IndexTupleData. If the HasNulls bit is set,
* this is followed by an IndexAttributeBitMapData. The index attribute
* values follow, beginning at a MAXALIGN boundary.
*
* Note that the space allocated for the bitmap does not vary with the number
* of attributes; that is because we don't have room to store the number of
* attributes in the header. Given the MAXALIGN constraint there's no space
* savings to be had anyway, for usual values of INDEX_MAX_KEYS.
*/
typedef struct IndexTupleData
{
ItemPointerData t_tid; /* reference TID to heap tuple */
/* ---------------
* t_info is laid out in the following fashion:
*
* //该元组是否为null
* 15th (high) bit: has nulls
* //该元组是否有可变长度的属性
* 14th bit: has var-width attributes
* //access method 定义其意义
* 13th bit: AM-defined meaning
* //元组的大小
* 12-0 bit: size of tuple
* ---------------
*/
unsigned short t_info; /* various info about tuple */
} IndexTupleData;
typedef struct IndexAttributeBitMapData
{
bits8 bits[(INDEX_MAX_KEYS + 8 - 1) / 8];
} IndexAttributeBitMapData;
所以如果元组为null,才会跟上 IndexAttributeBitMapData
