Percona MySQL线程池的问题

来源：这里教程网时间：2026-03-01 17:07:45 作者：

使用percona mysql线程池的数据库总是存在下面的链接

 3387111680 | unauthenticated user | connecting host      | NULL                          | Connect          |      0 | login

分析下代码，mysql启动后，进入链接处理，使用process_new_connection处理新的链接

bool init_connection_acceptor()
{
return m_listener->setup_listener();
}
  /**
    Connection acceptor loop to accept connections from clients.
  */
void connection_event_loop()
{
Connection_handler_manager *mgr= Connection_handler_manager::get_instance();
while (!abort_loop)
{
Channel_info *channel_info= m_listener->listen_for_connection_event();
if (channel_info != NULL)
mgr->process_new_connection(channel_info);
}
}

process_new_connection调用不同handler的add_connection 添加链接

if (handler->add_connection(channel_info))
{
inc_aborted_connects();
delete channel_info;
}

下面是连接池的处理，先创建线程，根据线程分配链接，根据线程id 对group_count取余计算对应的线程组，然后将链接放到组的队列中

bool Thread_pool_connection_handler::add_connection(Channel_info *channel_info)
{
DBUG_ENTER("Thread_pool_connection_handler::add_connection");
THD* thd= channel_info->create_thd();
if (unlikely(!thd))
{
channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false);
DBUG_RETURN(true);
}
connection_t *connection= alloc_connection(thd);
if (unlikely(!connection))
{
thd->get_protocol_classic()->end_net();
delete thd;
channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false);
DBUG_RETURN(true);
}
delete channel_info;
thd->set_new_thread_id();
thd->start_utime= thd->thr_create_utime= my_micro_time();
thd->scheduler= &tp_event_functions;
Global_THD_manager::get_instance()->add_thd(thd);
thd->event_scheduler.data= connection;
  /* Assign connection to a group. */
thread_group_t *group= &all_groups[thd->thread_id() % group_count];
connection->thread_group=group;
mysql_mutex_lock(&group->mutex);
group->connection_count++;
mysql_mutex_unlock(&group->mutex);
  /*
    Add connection to the work queue. Actual login
    will be done by a worker thread.
  */
queue_put(group, connection);
DBUG_RETURN(false);
}

在queue_put中如果组所有的线程都不是活跃的，那么唤醒或创建一个线程，在创建worker的时候有限流，里面针对连接限流的处理不是特别好，根据线程组中线程数据，返回不通的时间，越大返回的时间越长，

static ulonglong microsecond_throttling_interval(thread_group_t *thread_group)
{
int count= thread_group->thread_count;
if (count < 4)
return 0;
if (count < 8)
return 50*1000;
if(count < 16)
return 100*1000;
return 200*1000;
}

。在每次创建worker的时候，调用worker_main，这个worker会进去不停的循环中，使用get_event处理 event, get_event中在线程组的队列中获取链接，进行处理。在get_event中，会判断是否有太多活跃的线程，如果是有太多活跃的线程，那么将当前线程放入等待列表。如果获取到了链接，如果当前线程有太多的繁忙的链接，那么将链接放入低优先级的队列中。在获取链接的时候，先在高优先级的队列获取链接，没有就在低优先级队列获取。判断连接是否是高优先级，根据mode以及锁

inline bool connection_is_high_prio(const connection_t *c)
{
const ulong mode= c->thd->variables.threadpool_high_prio_mode;
return (mode == TP_HIGH_PRIO_MODE_STATEMENTS) ||
(mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c->tickets > 0 &&
(thd_is_transaction_active(c->thd) ||
c->thd->variables.option_bits & OPTION_TABLE_LOCK ||
c->thd->locked_tables_mode != LTM_NONE ||
c->thd->mdl_context.has_locks() ||
c->thd->global_read_lock.is_acquired() ||
c->thd->backup_tables_lock.is_acquired() ||
c->thd->backup_binlog_lock.is_acquired() ||
c->thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) ||
c->thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE)));
}

队列中分了高低优先级

connection_queue_t queue;
connection_queue_t high_prio_queue;

没有活跃线程的时候，唤醒创建worker，先尝试唤醒，唤醒不行，如果没有活跃的就创建worker，唤醒不行，有些活跃的，会限流创建。如果线程池中的线程都太忙，要等待。唤醒是在等待队列中取出线程。所以如果系统比较繁忙，连接会等待，check_stall时候，或者刚开始来大量连接，如果系统没有活跃的线程，突然来了很多连接，那么创建的速度会被限流。 check_stall会唤醒或创建线程。 ----线程池中的等待是否有超时？一直没有被释放?能查看等待线程的数量？线程池中的等待，有3种 lock,sync,net，这几个等待的调度是在connection_handler_manager.cc中出现的。超时的处理中，根据connection中的abs_timeout判断，超时就kill掉

connection_t *connection= (connection_t *)thd->event_scheduler.data;
if (!connection)
{
      /*
        Connection does not have scheduler data. This happens for example
        if THD belongs to a different scheduler, that is listening to
        extra_port.
      */
return;
}
if(connection->abs_wait_timeout < m_timer->current_microtime)
{
      /* Wait timeout exceeded, kill connection. */
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->killed = THD::KILL_CONNECTION;
tp_post_kill_notification(thd);
mysql_mutex_unlock(&thd->LOCK_thd_data);
}

超时时间的设置，超时时间的设置是net_wait_timeout，kill_idle_transaction_timeout, threadpool_stall_limit组成

c->abs_wait_timeout= pool_timer.current_microtime +
1000LL*pool_timer.tick_interval +
1000000LL*c->thd->get_wait_timeout();
pool_timer.tick_interval= threadpool_stall_limit;
inline ulong get_wait_timeout(void) const
{
if (in_active_multi_stmt_transaction()
&& kill_idle_transaction_timeout > 0
&& kill_idle_transaction_timeout < variables.net_wait_timeout)
return kill_idle_transaction_timeout;
return variables.net_wait_timeout;
}

这个timeout是wokre每次处理完handle_event才进行的设置，那么处于等待状态的线程是没有设置timeout的，如果出现长时间的等待，这种连接也应该kill掉，会有一直处于等待的连接在。出现很多线程等待的时候，从锁，io，并发请求量3个方面进行排查。所以出现上面的等待连接的问题，先看下是不是有慢sql，然后看看线程池设置，有很多慢sql,将线程使用满，导致后续连接处于等待状态。在add_connection方法中，所有的线程都是放入thread_group队列中的，没有区分后台线程以及dump线程。所以设置参数的时候需要注意。线上案例

Id	User	Host	db	Command	Time	State	Info	Rows_sent	Rows_examined
472275	replica	xxx:37962	NULL	Binlog Dump GTID	20241	Master has sent all binlog to slave; waiting for more updates	NULL	0	0
472276	replica	xx:59468	NULL	Binlog Dump GTID	20241	Master has sent all binlog to slave; waiting for more updates	NULL	0	0
472277	replica	xx:35198	NULL	Binlog Dump GTID	20241	Master has sent all binlog to slave; waiting for more updates	NULL	0	0
472278	replica	xx:51560	NULL	Binlog Dump GTID	20241	Master has sent all binlog to slave; waiting for more updates	NULL	0	0
617291	aa	xxx:43378	bb	Query	468	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
626417	aa	xxx:34124	bb	Query	540	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
628343	aa	xxx:53988	bb	Query	540	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
628381	aa	xxx:41336	bb	Query	477	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
628749	aa	xxx:42270	bb	Query	537	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
628786	aa	xxx:55506	bb	Query	530	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
629378	aa	xxx:43838	bb	Query	479	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
629384	aa	xxx:56468	bb	Query	479	Sending data	SELECT `ccc`.`id` AS `ccc_id`, `ccc`.`metri	1	0
634497	aa	xxx:33786	bb	Query	37	Sending data	SELECT COUNT(1) AS `cnt` FROM `zz` `ccc` WHERE `Metric	0	0
635301	aa	xx:44884	bb	Execute	12	Sending data	SELECT id, zz as ff, ee, dd, cc FROM `zz	0	0
637553	qq	xx:52912	NULL	Query	0	starting	show processlist

看到慢sql以及binlog dump线程一共15，线程池设置| thread_pool_oversubscribe | 7 || thread_pool_size | 2 | 一共能使用的线程设置成了14个正好满足too_many_active_threads的设置，这个时候就开始等待了