Skip to content

QQ: Avoid secondary process when repairing leader record. #10193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 27, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 52 additions & 25 deletions deps/rabbit/src/rabbit_quorum_queue.erl
Original file line number Diff line number Diff line change
Expand Up @@ -230,18 +230,23 @@ start_cluster(Q) ->
{error, {too_long, N}} ->
rabbit_data_coercion:to_atom(ra:new_uid(N))
end,
{Leader, Followers} = rabbit_queue_location:select_leader_and_followers(Q, QuorumSize),
LeaderId = {RaName, Leader},
{LeaderNode, FollowerNodes} =
rabbit_queue_location:select_leader_and_followers(Q, QuorumSize),
LeaderId = {RaName, LeaderNode},
NewQ0 = amqqueue:set_pid(Q, LeaderId),
NewQ1 = amqqueue:set_type_state(NewQ0, #{nodes => [Leader | Followers]}),
NewQ1 = amqqueue:set_type_state(NewQ0,
#{nodes => [LeaderNode | FollowerNodes]}),

rabbit_log:debug("Will start up to ~w replicas for quorum ~ts with leader on node '~ts'",
[QuorumSize, rabbit_misc:rs(QName), Leader]),
[QuorumSize, rabbit_misc:rs(QName), LeaderNode]),
case rabbit_amqqueue:internal_declare(NewQ1, false) of
{created, NewQ} ->
RaConfs = [make_ra_conf(NewQ, ServerId)
|| ServerId <- members(NewQ)],
try erpc_call(Leader, ra, start_cluster,

%% khepri projections on remote nodes are eventually consistent
wait_for_projections(LeaderNode, QName),
try erpc_call(LeaderNode, ra, start_cluster,
[?RA_SYSTEM, RaConfs, ?START_CLUSTER_TIMEOUT],
?START_CLUSTER_RPC_TIMEOUT) of
{ok, _, _} ->
Expand All @@ -266,10 +271,10 @@ start_cluster(Q) ->
ActingUser}]),
{new, NewQ};
{error, Error} ->
declare_queue_error(Error, NewQ, Leader, ActingUser)
declare_queue_error(Error, NewQ, LeaderNode, ActingUser)
catch
error:Error ->
declare_queue_error(Error, NewQ, Leader, ActingUser)
declare_queue_error(Error, NewQ, LeaderNode, ActingUser)
end;
{existing, _} = Ex ->
Ex
Expand Down Expand Up @@ -359,26 +364,28 @@ local_or_remote_handler(ChPid, Module, Function, Args) ->
end.

become_leader(QName, Name) ->
%% as this function is called synchronously when a ra node becomes leader
%% we need to ensure there is no chance of blocking as else the ra node
%% may not be able to establish its leadership
spawn(fun () -> become_leader0(QName, Name) end).

become_leader0(QName, Name) ->
Fun = fun (Q1) ->
amqqueue:set_state(
amqqueue:set_pid(Q1, {Name, node()}),
live)
end,
%% as this function is called synchronously when a ra node becomes leader
%% we need to ensure there is no chance of blocking as else the ra node
%% may not be able to establish its leadership
spawn(fun() ->
_ = rabbit_amqqueue:update(QName, Fun),
case rabbit_amqqueue:lookup(QName) of
{ok, Q0} when ?is_amqqueue(Q0) ->
Nodes = get_nodes(Q0),
[_ = erpc_call(Node, ?MODULE, rpc_delete_metrics,
[QName], ?RPC_TIMEOUT)
|| Node <- Nodes, Node =/= node()];
_ ->
ok
end
end).
_ = rabbit_amqqueue:update(QName, Fun),
case rabbit_amqqueue:lookup(QName) of
{ok, Q0} when ?is_amqqueue(Q0) ->
Nodes = get_nodes(Q0),
_ = [_ = erpc_call(Node, ?MODULE, rpc_delete_metrics,
[QName], ?RPC_TIMEOUT)
|| Node <- Nodes, Node =/= node()],
ok;
_ ->
ok
end.

-spec all_replica_states() -> {node(), #{atom() => atom()}}.
all_replica_states() ->
Expand Down Expand Up @@ -550,7 +557,7 @@ handle_tick(QName,
catch
_:Err ->
rabbit_log:debug("~ts: handle tick failed with ~p",
[rabbit_misc:rs(QName), Err]),
[rabbit_misc:rs(QName), Err]),
ok
end
end).
Expand All @@ -566,7 +573,7 @@ repair_leader_record(QName, Self) ->
rabbit_log:debug("~ts: repairing leader record",
[rabbit_misc:rs(QName)]),
{_, Name} = erlang:process_info(Self, registered_name),
become_leader(QName, Name),
ok = become_leader0(QName, Name),
ok
end,
ok.
Expand Down Expand Up @@ -633,7 +640,7 @@ recover(_Vhost, Queues) ->
Err1 == name_not_registered ->
rabbit_log:warning("Quorum queue recovery: configured member of ~ts was not found on this node. Starting member as a new one. "
"Context: ~s",
[rabbit_misc:rs(QName), Err1]),
[rabbit_misc:rs(QName), Err1]),
% queue was never started on this node
% so needs to be started from scratch.
case start_server(make_ra_conf(Q0, ServerId)) of
Expand Down Expand Up @@ -1901,3 +1908,23 @@ force_all_queues_shrink_member_to_current_member() ->
is_minority(All, Up) ->
MinQuorum = length(All) div 2 + 1,
length(Up) < MinQuorum.

wait_for_projections(Node, QName) ->
case rabbit_feature_flags:is_enabled(khepri_db) andalso
Node =/= node() of
true ->
wait_for_projections(Node, QName, 256);
false ->
ok
end.

wait_for_projections(Node, QName, 0) ->
exit({wait_for_projections_timed_out, Node, QName});
wait_for_projections(Node, QName, N) ->
case erpc_call(Node, rabbit_amqqueue, lookup, [QName], 100) of
{ok, _} ->
ok;
_ ->
timer:sleep(100),
wait_for_projections(Node, QName, N - 1)
end.
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy