Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5d2ef9fd2e | |||
|
5b026593ce
|
@@ -21,7 +21,7 @@ steps:
|
||||
- bao kv get -mount secret -field RENOVATE_TOKEN renovate > /woodpecker/renovate_token
|
||||
- bao kv get -mount secret -field GITHUB_COM_TOKEN renovate > /woodpecker/github_com_token
|
||||
- name: Run Renovate
|
||||
image: renovate/renovate:43.196.0
|
||||
image: renovate/renovate:43.197.0
|
||||
environment:
|
||||
RENOVATE_AUTODISCOVER: "true"
|
||||
RENOVATE_ENDPOINT: https://gitea.lumpiasty.xyz/api/v1
|
||||
|
||||
@@ -10,10 +10,6 @@ openwrt_mgmt_ip: 192.168.255.11
|
||||
openwrt_mgmt_prefix: 24
|
||||
openwrt_mgmt_gateway: 192.168.255.10
|
||||
|
||||
# DNS servers for the AP itself
|
||||
openwrt_dns_servers:
|
||||
- 192.168.0.1
|
||||
|
||||
# SSH authorised keys (list of public key strings)
|
||||
openwrt_ssh_authorized_keys: []
|
||||
|
||||
@@ -26,4 +22,6 @@ openwrt_ntp_servers:
|
||||
openwrt_packages:
|
||||
- usb-modeswitch # switches embedded LTE modem (Qualcomm 05c6:9008) from EDL to QMI mode on boot
|
||||
- luci-proto-qmi # adds QMI protocol support to LuCI for configuring the embedded LTE modem
|
||||
- bird2 # BGP daemon — peers with CRS for LTE failover route signalling
|
||||
- bird2c # Control CLI interface for BGP daemon
|
||||
|
||||
|
||||
@@ -12,3 +12,8 @@
|
||||
- name: Reload wireless
|
||||
community.openwrt.command:
|
||||
cmd: wifi reload
|
||||
|
||||
- name: Reload bird
|
||||
community.openwrt.service:
|
||||
name: bird
|
||||
state: restarted
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
---
|
||||
# Configures BIRD2 on the D-Link as an iBGP peer of the MikroTik CRS418.
|
||||
#
|
||||
# Route exchange:
|
||||
# D-Link → CRS: announces 0.0.0.0/0 and 2000::/3 when wwan0 is up.
|
||||
# CRS installs these at BGP distance 200 (below the GPON
|
||||
# static default at distance 1 — activates only on GPON failure).
|
||||
#
|
||||
# CRS → D-Link: announces connected routes (VLAN subnets), static routes
|
||||
# (Tailscale, GPON default), and reflects k8s BGP routes.
|
||||
# BIRD2 installs all of these into the kernel at metric 10.
|
||||
#
|
||||
# D-Link's own routing:
|
||||
# - Kernel metric 10: BGP-learned routes from CRS (preferred)
|
||||
# - Kernel metric 100: wwan QMI-assigned routes (fallback)
|
||||
# No static default gateway on uplink — the default comes from BGP.
|
||||
# When GPON fails, CRS withdraws the BGP default; D-Link falls back to wwan.
|
||||
|
||||
- name: Write BIRD2 configuration
|
||||
community.openwrt.copy:
|
||||
dest: /etc/bird.conf
|
||||
mode: '0640'
|
||||
owner: root
|
||||
group: root
|
||||
content: |
|
||||
# BIRD2 — LTE failover BGP peer for MikroTik CRS418
|
||||
# iBGP session, AS 65000, peer: 192.168.6.1 (CRS vlan6)
|
||||
|
||||
router id 192.168.6.2;
|
||||
|
||||
protocol device {
|
||||
# Tracks interface up/down state via netlink.
|
||||
# scan time is a periodic reconciliation fallback; real events are
|
||||
# netlink-driven and processed immediately.
|
||||
scan time 5;
|
||||
}
|
||||
|
||||
# Announce directly connected prefixes into BIRD2's RIB so that
|
||||
# next-hop resolution works for BGP routes received from CRS.
|
||||
# Without this, 192.168.6.1 (CRS uplink) is unresolvable and all
|
||||
# IPv4 BGP routes appear unreachable. Same for IPv6 uplink prefix.
|
||||
protocol direct {
|
||||
ipv4;
|
||||
ipv6;
|
||||
interface "eth0.6";
|
||||
}
|
||||
|
||||
# Install BGP-learned routes from CRS into the kernel at metric 10.
|
||||
# This is lower than the wwan QMI default (metric 100), so D-Link
|
||||
# prefers the CRS path for its own outbound traffic when GPON is up.
|
||||
# import none: BIRD2 does not read the kernel table, preventing
|
||||
# wwan kernel routes from leaking into BGP.
|
||||
protocol kernel k4 {
|
||||
ipv4 {
|
||||
import none;
|
||||
export filter {
|
||||
if proto = "crs" then {
|
||||
krt_metric = 10;
|
||||
accept;
|
||||
}
|
||||
reject;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
protocol kernel k6 {
|
||||
ipv6 {
|
||||
import none;
|
||||
export filter {
|
||||
if proto = "crs" then {
|
||||
krt_metric = 10;
|
||||
accept;
|
||||
}
|
||||
reject;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
# LTE default routes — exist only while wwan0 is up.
|
||||
# BIRD2's device protocol tracks wwan0 via netlink; when the interface
|
||||
# goes down the routes become unreachable and BGP withdraws them.
|
||||
# Uses interface-name routing (no explicit gateway IP) which is correct
|
||||
# for QMI raw-ip POINTOPOINT NOARP interfaces.
|
||||
#
|
||||
# Preference 50 is below BGP's default of 100 — these routes are only
|
||||
# used by BIRD2 internally as a presence signal for BGP export, NOT for
|
||||
# installing into the kernel as our active default route. The kernel
|
||||
# already gets the wwan default at metric 100 via netifd/qmi.sh, and
|
||||
# we want the BGP-learned default via CRS (kernel metric 10) to be
|
||||
# preferred for D-Link's own outbound traffic when GPON is up.
|
||||
protocol static lte_default {
|
||||
ipv4 {
|
||||
preference 50;
|
||||
};
|
||||
route 0.0.0.0/0 via "wwan0";
|
||||
}
|
||||
|
||||
protocol static lte_default6 {
|
||||
ipv6 {
|
||||
preference 50;
|
||||
};
|
||||
route 2000::/3 via "wwan0";
|
||||
}
|
||||
|
||||
protocol bgp crs {
|
||||
description "MikroTik CRS418 — LTE failover signalling";
|
||||
local 192.168.6.2 as 65000;
|
||||
neighbor 192.168.6.1 as 65000;
|
||||
hold time 30;
|
||||
keepalive time 10;
|
||||
|
||||
ipv4 {
|
||||
# Import all prefixes CRS announces (VLAN subnets, static routes,
|
||||
# k8s BGP routes reflected via RR). Installed into kernel via k4.
|
||||
import all;
|
||||
# Export only the wwan-sourced LTE default route.
|
||||
# BGP-learned CRS routes are never re-exported (iBGP split-horizon
|
||||
# applies; BIRD2 also does not import CRS routes into its RIB from
|
||||
# the kernel, so they cannot appear here).
|
||||
export where proto = "lte_default";
|
||||
};
|
||||
|
||||
ipv6 {
|
||||
# CRS uses Extended Next Hop (RFC 5549) for IPv6 routes, advertising
|
||||
# them with the IPv4 next-hop 192.168.6.1. The Linux kernel cannot
|
||||
# install IPv6 routes with IPv4 next-hops. Accept the routes from BGP
|
||||
# (we negotiated ENHE via "extended next hop yes") but rewrite the
|
||||
# next-hop in the import filter to the CRS's native IPv6 address on
|
||||
# vlan6 before they reach the kernel.
|
||||
extended next hop yes;
|
||||
import filter {
|
||||
gw = 2001:470:61a3:600::1;
|
||||
accept;
|
||||
};
|
||||
# Force our own native IPv6 address as the next-hop when advertising
|
||||
# to CRS, otherwise BIRD2 also uses ENHE and CRS receives a route
|
||||
# with ::ffff:192.168.6.2 which it can't resolve as an IPv6 next-hop.
|
||||
export filter {
|
||||
if proto = "lte_default6" then {
|
||||
bgp_next_hop = 2001:470:61a3:600::2;
|
||||
accept;
|
||||
}
|
||||
reject;
|
||||
};
|
||||
};
|
||||
}
|
||||
notify: Reload bird
|
||||
|
||||
- name: Enable and start BIRD2 service
|
||||
community.openwrt.service:
|
||||
name: bird
|
||||
enabled: true
|
||||
state: started
|
||||
@@ -20,10 +20,11 @@
|
||||
# output: ACCEPT (AP itself initiates outbound — opkg, NTP, etc.)
|
||||
# forward: REJECT (AP does not route client traffic through uplink)
|
||||
#
|
||||
# wwan — LTE modem uplink (Orange PL, /dev/cdc-wdm0, disabled by default)
|
||||
# wwan — LTE modem uplink (Orange PL, /dev/cdc-wdm0, always-on)
|
||||
# input: REJECT (no inbound from LTE)
|
||||
# output: ACCEPT (AP itself uses LTE for outbound when uplink unavailable)
|
||||
# forward: REJECT (no client traffic through LTE)
|
||||
# forward: REJECT (default; overridden by explicit uplink→wwan forwarding rule)
|
||||
# masq/masq6: enabled — NAT all traffic exiting via wwan (own + forwarded)
|
||||
#
|
||||
# No forwarding rules between zones — all inter-zone policy is on MikroTik.
|
||||
|
||||
@@ -75,6 +76,15 @@
|
||||
option input 'REJECT'
|
||||
option output 'ACCEPT'
|
||||
option forward 'REJECT'
|
||||
option masq '1'
|
||||
option masq6 '1'
|
||||
|
||||
# Forward traffic from MikroTik (arriving on uplink/vlan6) out through wwan
|
||||
# during LTE failover. MikroTik routes LAN/SRV/IoT traffic here when GPON
|
||||
# is down and the BGP-learned default via 192.168.6.2 is active.
|
||||
config forwarding
|
||||
option src 'uplink'
|
||||
option dest 'wwan'
|
||||
|
||||
config rule
|
||||
option name 'Allow-ICMPv6-uplink'
|
||||
|
||||
@@ -18,6 +18,9 @@
|
||||
- name: WWAN modem configuration
|
||||
ansible.builtin.import_tasks: wwan.yml
|
||||
|
||||
- name: BIRD2 BGP configuration
|
||||
ansible.builtin.import_tasks: bird.yml
|
||||
|
||||
- name: Firewall configuration
|
||||
ansible.builtin.import_tasks: firewall.yml
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# mgmt — static 192.168.255.11/24 on eth0.1, management
|
||||
# lan — bridge (br-lan) on eth0.2, LAN clients via LAN ports
|
||||
# iot — bridge (br-iot) on eth0.5, IoT clients via wifi only
|
||||
# uplink — static 192.168.6.2/24 + 2001:470:61a3:600::2/64 on eth0.6, internet access for opkg
|
||||
# uplink — static 192.168.6.2/24 + 2001:470:61a3:600::2/64 on eth0.6, BGP peer link to CRS (no static gateway — default learned via BIRD2)
|
||||
# wwan — QMI LTE modem (/dev/cdc-wdm0), Orange PL dual-stack failover (APNs: internet + internetipv6)
|
||||
# Manual ifup only (option auto '0'); modem-specific quirks handled in wwan.yml.
|
||||
|
||||
@@ -156,17 +156,21 @@
|
||||
option pdptype 'ipv4v6'
|
||||
option dhcp '0'
|
||||
option dhcpv6 '0'
|
||||
option peerdns '0'
|
||||
option metric '100'
|
||||
# auto '0': netifd does not bring up wwan at boot. The modem takes
|
||||
# 30-90s after boot before its QMI service responds, and netifd's
|
||||
# retry/backoff handles this poorly (failed attempts leave the
|
||||
# interface in 'pending' state). A separate procd service waits
|
||||
# for the modem to be ready and triggers ifup wwan once.
|
||||
option auto '0'
|
||||
|
||||
config interface 'uplink'
|
||||
option device 'eth0.6'
|
||||
option proto 'static'
|
||||
option ipaddr '192.168.6.2/24'
|
||||
option gateway '192.168.6.1'
|
||||
option dns '192.168.6.1'
|
||||
option ip6addr '2001:470:61a3:600::2/64'
|
||||
option ip6gw '2001:470:61a3:600::1'
|
||||
|
||||
notify: Reload network
|
||||
|
||||
|
||||
@@ -28,3 +28,13 @@
|
||||
key: "dropbear.@dropbear[0].authorized_keys"
|
||||
value: "{{ openwrt_ssh_authorized_keys | join('\n') }}"
|
||||
when: openwrt_ssh_authorized_keys | length > 0
|
||||
|
||||
# The D-Link is a pure AP/relay — no local clients need DNS from it.
|
||||
# Disable dnsmasq entirely and point the system resolver directly at the
|
||||
# CRS (192.168.6.1), which is always reachable via vlan6 regardless of
|
||||
# WAN state and resolves using public upstream servers (1.1.1.1 etc.).
|
||||
- name: Disable dnsmasq service
|
||||
community.openwrt.service:
|
||||
name: dnsmasq
|
||||
enabled: false
|
||||
state: stopped
|
||||
|
||||
@@ -103,3 +103,138 @@
|
||||
community.openwrt.command:
|
||||
cmd: uqmi -t 3000 -d /dev/cdc-wdm0 --modify-profile 3gpp,2 --apn internetipv6 --pdp-type ipv6
|
||||
changed_when: false
|
||||
|
||||
# On cold boot the BM806C's UIM (SIM) QMI service comes up permanently
|
||||
# broken: --uim-get-sim-state returns {}, --get-imsi returns
|
||||
# "UIM uninitialized", AT+CPIN? returns +CME ERROR: SIM busy, and the
|
||||
# modem never converges (verified at uptime 21 min with no intervention).
|
||||
# CTL/NAS/WDS do come up after ~5 min of warmup, but UIM does not.
|
||||
#
|
||||
# A single USB re-enumeration of the device (authorized=0 / authorized=1)
|
||||
# forces the modem to redo its internal QMI service init from scratch.
|
||||
# After this, UIM comes up within ~1 s and ifup wwan succeeds normally.
|
||||
#
|
||||
# We use authorized=0/1 rather than usb/unbind+bind because the former
|
||||
# keeps qmi_wwan in the bound-drivers list and the kernel re-runs its
|
||||
# bind machinery for us; the latter detaches and re-attaches drivers
|
||||
# explicitly. Both work; authorized is cleaner.
|
||||
#
|
||||
# Full investigation, ruled-out hypotheses, and reproduction steps:
|
||||
# /root/wwan-diag/boot-wedge-investigation.md on the router.
|
||||
- name: Install wwan-bringup worker script
|
||||
community.openwrt.copy:
|
||||
dest: /usr/libexec/wwan-bringup
|
||||
mode: '0755'
|
||||
owner: root
|
||||
group: root
|
||||
content: |
|
||||
#!/bin/sh
# wwan-bringup worker, called by /etc/init.d/wwan-bringup.
#
# Steps:
#   1. Wait for the modem's QMI control device ($DEV) to enumerate.
#   2. Re-enumerate the modem's USB device once (authorized=0 / 1) to
#      clear the BM806C cold-boot UIM wedge.
#   3. Wait for $DEV to return, then `ifup` the wwan interface.
#   4. Add a non-source-specific IPv6 default route on $NETDEV so that
#      forwarded traffic has a usable route.
#
# Exit status: 1 if $DEV never appears (before or after re-auth),
# 0 otherwise. A missing IPv6 gateway is logged as a warning only.

DEV=/dev/cdc-wdm0
IFACE=wwan
NETDEV=wwan0
USB_PORT=1-1
AUTH_FILE=/sys/bus/usb/devices/$USB_PORT/authorized

# log MESSAGE - send MESSAGE to syslog under the wwan-bringup tag.
log() {
    logger -t wwan-bringup "$1"
}

# wait_for_dev SECONDS - poll once per second until $DEV exists.
# Returns 0 as soon as the device node is present, 1 once SECONDS
# one-second waits have elapsed without it appearing.
wait_for_dev() {
    limit=$1
    waited=0
    while [ ! -e "$DEV" ]; do
        [ "$waited" -ge "$limit" ] && return 1
        sleep 1
        waited=$((waited + 1))
    done
    return 0
}

# Wait for cold-boot enumeration of cdc-wdm0 (<=60s).
if ! wait_for_dev 60; then
    log "$DEV never appeared within 60s; giving up"
    exit 1
fi

# Force-clean re-enumeration. The BM806C's UIM QMI service never
# comes up on cold boot without this.
log "BM806C cold-boot UIM workaround: re-authorizing $USB_PORT"
if echo 0 > "$AUTH_FILE"; then
    sleep 3
    echo 1 > "$AUTH_FILE" || log "failed to re-authorize $USB_PORT via $AUTH_FILE"
else
    # Report the failure instead of silently continuing; the bring-up
    # below is still attempted, as before.
    log "cannot write $AUTH_FILE; skipping re-authorization"
fi

# Wait for cdc-wdm0 to return after re-enumeration (<=30s).
if ! wait_for_dev 30; then
    log "$DEV did not return after re-auth; giving up"
    exit 1
fi

# qmi.sh's own SIM-init and network-registration loops handle the
# small remaining warmup (~5-30s) gracefully now that UIM is healthy.
log "bringing up $IFACE"
ifup "$IFACE"

# qmi.sh installs an IPv6 default route with a source-specific prefix
# constraint (`default from 2a00:f44:.../64 ...`). Only traffic sourced
# from the wwan IPv6 prefix uses it, so forwarded traffic from internal
# subnets fails the routing lookup with "net unreachable" before
# masquerade can rewrite the source. Add a non-source-specific default
# at a higher metric so forwarded traffic has a valid route, gets routed
# out $NETDEV, and is then masqueraded by fw4.
#
# Wait up to 90s for qmi.sh to install its source-specific default,
# then derive the gateway from it and add a regular default route.
gw6=
waited=0
while [ "$waited" -lt 90 ]; do
    # Field 5 of "default from <prefix> via <gw> dev ..." is the gateway.
    gw6=$(ip -6 route show default dev "$NETDEV" 2>/dev/null | awk '/^default from/ {print $5; exit}')
    if [ -n "$gw6" ]; then
        if ip -6 route show default dev "$NETDEV" | grep -qE "^default via "; then
            log "non-source-specific IPv6 default already present"
        else
            log "adding non-source-specific IPv6 default via $gw6"
            ip -6 route add default via "$gw6" dev "$NETDEV" metric 1024 \
                || log "failed to add non-source-specific IPv6 default via $gw6"
        fi
        break
    fi
    sleep 3
    waited=$((waited + 3))
done

# Use an explicit `if` rather than `[ -z ... ] && log ...`: as the last
# command, the short-circuit form made the script exit 1 whenever the
# gateway WAS found (i.e. on success).
if [ -z "$gw6" ]; then
    log "warning: wwan IPv6 gateway never appeared, skipping default route"
fi

exit 0
|
||||
|
||||
- name: Install wwan-bringup init script
|
||||
community.openwrt.copy:
|
||||
dest: /etc/init.d/wwan-bringup
|
||||
mode: '0755'
|
||||
owner: root
|
||||
group: root
|
||||
content: |
|
||||
#!/bin/sh /etc/rc.common
# Init wrapper for the one-shot /usr/libexec/wwan-bringup worker, which
# re-enumerates the BM806C USB device to clear the cold-boot UIM wedge
# and then triggers `ifup wwan`.

USE_PROCD=1
START=99

# The worker is launched directly in the background rather than being
# registered through procd_open_instance; its PID is recorded here so
# that a stop request can terminate a worker that is still running.
PIDFILE=/var/run/wwan-bringup.pid

# Launch the worker in the background and remember its PID.
start_service() {
    /usr/libexec/wwan-bringup &
    printf '%s\n' "$!" > "$PIDFILE"
}

# Kill the recorded worker (if a PID file exists) and drop the PID file.
stop_service() {
    if [ -f "$PIDFILE" ]; then
        kill "$(cat "$PIDFILE")" 2>/dev/null
    fi
    rm -f "$PIDFILE"
}
|
||||
|
||||
- name: Enable and start wwan-bringup service
|
||||
community.openwrt.service:
|
||||
name: wwan-bringup
|
||||
enabled: true
|
||||
state: started
|
||||
|
||||
@@ -51,10 +51,10 @@
|
||||
data:
|
||||
- interface: pppoe-gpon
|
||||
list: wan
|
||||
- interface: lte1
|
||||
list: wan
|
||||
- interface: sit1
|
||||
list: wan
|
||||
- interface: vlan6
|
||||
list: wan
|
||||
handle_absent_entries: remove
|
||||
handle_entries_content: remove_as_much_as_possible
|
||||
|
||||
|
||||
@@ -10,11 +10,6 @@
|
||||
chain: forward
|
||||
comment: Allow all already established connections
|
||||
connection-state: established,related
|
||||
- action: accept
|
||||
chain: forward
|
||||
comment: Allow LTE modem management (next rule forbids it otherwise)
|
||||
dst-address: 192.168.8.1
|
||||
out-interface: lte1
|
||||
- action: reject
|
||||
chain: forward
|
||||
comment: Forbid forwarding 192.168.0.0/16 to WAN
|
||||
@@ -173,7 +168,13 @@
|
||||
comment: Allow BGP from SRV
|
||||
dst-port: 179
|
||||
in-interface: vlan4
|
||||
protocol: udp
|
||||
protocol: tcp
|
||||
- action: accept
|
||||
chain: input
|
||||
comment: Allow BGP from OPENWRT UPLINK
|
||||
dst-port: 179
|
||||
in-interface: vlan6
|
||||
protocol: tcp
|
||||
- action: accept
|
||||
chain: input
|
||||
comment: NAT-PMP from LAN
|
||||
@@ -243,15 +244,11 @@
|
||||
- action: masquerade
|
||||
chain: srcnat
|
||||
comment: Masquerade to internet
|
||||
out-interface-list: wan
|
||||
out-interface: pppoe-gpon
|
||||
- action: masquerade
|
||||
chain: srcnat
|
||||
comment: GPON ONT management
|
||||
dst-address: 192.168.100.1
|
||||
- action: masquerade
|
||||
chain: srcnat
|
||||
comment: LTE Modem management
|
||||
dst-address: 192.168.8.1
|
||||
- action: dst-nat
|
||||
chain: dstnat
|
||||
comment: TS3
|
||||
@@ -516,6 +513,13 @@
|
||||
in-interface: vlan4
|
||||
protocol: tcp
|
||||
src-address: 2001:470:61a3:100::/64
|
||||
- action: accept
|
||||
chain: input
|
||||
comment: Allow BGP from OPENWRT UPLINK
|
||||
dst-port: 179
|
||||
in-interface: vlan6
|
||||
protocol: tcp
|
||||
src-address: 2001:470:61a3:600::/64
|
||||
- action: reject
|
||||
chain: input
|
||||
comment: Reject all remaining
|
||||
|
||||
@@ -39,39 +39,6 @@
|
||||
loop_control:
|
||||
label: "{{ item.default_name }}"
|
||||
|
||||
- name: Configure LTE interface defaults
|
||||
community.routeros.api_find_and_modify:
|
||||
ignore_dynamic: false
|
||||
path: interface lte
|
||||
find:
|
||||
default-name: lte1
|
||||
values:
|
||||
apn-profiles: default-nodns
|
||||
comment: Backup LTE WAN
|
||||
|
||||
- name: Configure LTE APN profiles
|
||||
community.routeros.api_modify:
|
||||
path: interface lte apn
|
||||
data:
|
||||
- add-default-route: false
|
||||
apn: internet
|
||||
comment: default but without dns and default route
|
||||
ipv6-interface: lte1
|
||||
name: default-nodns
|
||||
use-network-apn: true
|
||||
use-peer-dns: false
|
||||
# Default APN we can't really remove yet I don't want to reconfigure it
|
||||
- add-default-route: true
|
||||
apn: internet
|
||||
authentication: none
|
||||
default-route-distance: 2
|
||||
ip-type: auto
|
||||
name: default
|
||||
use-network-apn: true
|
||||
use-peer-dns: true
|
||||
handle_absent_entries: remove
|
||||
handle_entries_content: remove_as_much_as_possible
|
||||
|
||||
- name: Configure temporary disk for containers
|
||||
community.routeros.api_modify:
|
||||
path: disk
|
||||
|
||||
@@ -21,15 +21,6 @@
|
||||
suppress-hw-offload: false
|
||||
target-scope: 10
|
||||
vrf-interface: pppoe-gpon
|
||||
- disabled: false
|
||||
distance: 2
|
||||
dst-address: 0.0.0.0/0
|
||||
gateway: 192.168.8.1
|
||||
routing-table: main
|
||||
scope: 30
|
||||
suppress-hw-offload: false
|
||||
target-scope: 10
|
||||
vrf-interface: lte1
|
||||
handle_absent_entries: remove
|
||||
handle_entries_content: remove_as_much_as_possible
|
||||
|
||||
@@ -93,5 +84,27 @@
|
||||
remote.address: 2001:470:61a3:100::3/128
|
||||
routing-table: main
|
||||
templates: klaster
|
||||
- name: dlink-lte
|
||||
afi: ip,ipv6
|
||||
as: 65000
|
||||
connect: true
|
||||
disabled: false
|
||||
instance: bgp-homelab
|
||||
listen: true
|
||||
# ibgp-rr: CRS acts as route reflector for D-Link (the RR client).
|
||||
# This allows k8s routes learned from bgp1 to be reflected to D-Link
|
||||
# without violating iBGP split-horizon.
|
||||
local.role: ibgp-rr
|
||||
remote.address: 192.168.6.2/32
|
||||
routing-table: main
|
||||
templates: klaster
|
||||
hold-time: 30s
|
||||
keepalive-time: 10s
|
||||
# Redistribute connected (VLAN addresses) and static routes (Tailscale,
|
||||
# GPON default) so D-Link has explicit routes to all internal subnets
|
||||
# and a default route when GPON is up.
|
||||
output.redistribute: connected,static
|
||||
output.default-originate: if-installed
|
||||
nexthop-choice: force-self
|
||||
handle_absent_entries: remove
|
||||
handle_entries_content: remove_as_much_as_possible
|
||||
|
||||
@@ -0,0 +1,255 @@
|
||||
# LTE Failover Design
|
||||
|
||||
Reference documentation of the as-built LTE failover design. For day-to-day
|
||||
network overview see [network.md](./network.md); for BM806C modem firmware
|
||||
workarounds see [wwan-bm806c-qmi-workaround.md](./wwan-bm806c-qmi-workaround.md).
|
||||
|
||||
## Summary
|
||||
|
||||
| Property | Value |
|
||||
|---|---|
|
||||
| Failover signalling | Symmetric iBGP between D-Link (BIRD2) and CRS (RouterOS) |
|
||||
| BGP AS | 65000 (iBGP; CRS acts as route reflector for D-Link) |
|
||||
| LTE transit path | D-Link wwan ← VLAN 6 (192.168.6.0/24) ← CRS |
|
||||
| D-Link default route source | Learned from CRS via BGP (no static default gateway) |
|
||||
| CRS LTE route source | Learned from D-Link via BGP at distance 200 |
|
||||
| Announcement trigger | wwan interface up/down tracked by BIRD2 device protocol |
|
||||
| Scope | All internet-capable VLANs (vlan2, vlan4, vlan5, vlan6) |
|
||||
| IPv4 NAT | CRS masquerades on `pppoe-gpon` only; D-Link masquerades on `wwan` |
|
||||
| IPv6 NAT | D-Link masquerades IPv6 on `wwan` (no inbound on LTE; outbound only) |
|
||||
| wwan bringup | Triggered by `/etc/init.d/wwan-bringup` after USB re-auth (BM806C wedge fix) |
|
||||
|
||||
## Route exchange
|
||||
|
||||
### CRS announces to D-Link
|
||||
|
||||
| Prefix | Source | Withdrawn when |
|
||||
|---|---|---|
|
||||
| `0.0.0.0/0` | `output.default-originate: if-installed` (active default in main table) | GPON drops or `pppoe-gpon` route inactive |
|
||||
| `2000::/3` | `output.redistribute: static` (HE tunnel default) | `sit1` interface down / HE route inactive |
|
||||
| VLAN subnets (`192.168.0.0/24`, `192.168.1.0/24`, etc.) | `output.redistribute: connected` | never (CRS always reachable on vlan6) |
|
||||
| `100.64.0.0/10` (Tailscale) | `output.redistribute: static` | never |
|
||||
| `172.17.0.0/16` (Docker bridge) | `output.redistribute: connected` | never |
|
||||
| `10.42.0.0/16`, `10.43.0.0/16`, `10.44.0.0/16` (k8s) | reflected via iBGP RR | when k8s BGP session drops |
|
||||
| pod/service/LB IPv6 ranges | reflected via iBGP RR | when k8s BGP session drops |
|
||||
|
||||
Internal prefixes are announced regardless of GPON state. They remain
|
||||
reachable via `192.168.6.1` (directly connected on vlan6) even when GPON
|
||||
fails, so D-Link-originated traffic to internal subnets always routes to
|
||||
CRS rather than incorrectly exiting via wwan.
|
||||
|
||||
The CRS route reflector role (`local.role: ibgp-rr` on the `dlink-lte`
|
||||
connection) allows it to reflect routes learned from the k8s peer (`bgp1`)
|
||||
to D-Link without violating iBGP split-horizon. RFC 4456 `ORIGINATOR_ID`
|
||||
loop prevention is handled automatically by RouterOS — no output filter
|
||||
needed.
|
||||
|
||||
`nexthop-choice: force-self` ensures CRS advertises `192.168.6.1` as the
|
||||
next-hop for all prefixes, rather than the original route's next-hop
|
||||
(which may be unreachable from D-Link, e.g. k8s peer `2001:470:61a3:100::3`).
|
||||
|
||||
### D-Link announces to CRS
|
||||
|
||||
| Prefix | Source | Withdrawn when |
|
||||
|---|---|---|
|
||||
| `0.0.0.0/0` | BIRD2 static `lte_default` via `wwan0` | wwan0 down (device protocol detects) |
|
||||
| `2000::/3` | BIRD2 static `lte_default6` via `wwan0` | wwan0 down |
|
||||
|
||||
BIRD2's `protocol device` tracks wwan0 via netlink in real time; when the
|
||||
interface goes down the static routes become unreachable and BGP withdraws
|
||||
the announcements immediately.
|
||||
|
||||
The BIRD2 static routes use `preference 50` (below the BGP default of 100)
|
||||
so the BGP-learned routes from CRS are preferred for kernel installation
|
||||
on D-Link itself — D-Link's own outbound traffic uses the CRS path when
|
||||
GPON is up. The static routes only exist as triggers for BGP export.
|
||||
|
||||
### D-Link kernel routing table
|
||||
|
||||
| Destination | Source | Kernel metric | Active when |
|
||||
|---|---|---|---|
|
||||
| Internal prefixes (VLANs, k8s, Tailscale) | BGP from CRS, via `192.168.6.1` | 10 (IPv4) / 32 (IPv6) | always (CRS reachable) |
|
||||
| `0.0.0.0/0` | BGP from CRS | 10 | GPON up |
|
||||
| `0.0.0.0/0` | wwan QMI-assigned (qmi.sh) | 100 | wwan up |
|
||||
| `default via wwan IPv6 GW` (non-source-specific) | wwan-bringup script | 1024 | wwan up |
|
||||
| `default from <wwan prefix>/64 via wwan IPv6 GW` (source-specific) | qmi.sh | 100 | wwan up |
|
||||
|
||||
D-Link's own outbound traffic prefers the BGP route (metric 10) over wwan
|
||||
(metric 100). The non-source-specific IPv6 default at metric 1024 exists
|
||||
because qmi.sh only installs a source-specific IPv6 default (constrained
|
||||
to the wwan-assigned `/64` prefix); forwarded traffic from internal
|
||||
subnets would fail routing lookup with "net unreachable" without it.
|
||||
|
||||
### CRS routing table
|
||||
|
||||
| Destination | Source | Distance | Active when |
|
||||
|---|---|---|---|
|
||||
| `0.0.0.0/0` | static via `pppoe-gpon` | 1 | GPON up |
|
||||
| `0.0.0.0/0` | BGP from D-Link via `192.168.6.2` | 200 | wwan up on D-Link |
|
||||
| `2000::/3` | static via `sit1` (HE tunnel) | 1 | sit1 active (HE tunnel works) |
|
||||
| `2000::/3` | BGP from D-Link via `2001:470:61a3:600::2` | 200 | wwan up on D-Link |
|
||||
|
||||
RouterOS distance comparison is straightforward: distance 1 always wins
|
||||
over distance 200. BGP-learned routes activate automatically when the
|
||||
static route becomes inactive (e.g. GPON down → `pppoe-gpon` route
|
||||
inactive → BGP route at distance 200 becomes active).
|
||||
|
||||
## Traffic paths
|
||||
|
||||
### Normal (GPON up)
|
||||
|
||||
```
|
||||
LAN/SRV/IoT → CRS → pppoe-gpon → ISP
|
||||
D-Link own → uplink → CRS → pppoe-gpon → ISP
|
||||
(via BGP-learned default at kernel metric 10)
|
||||
```
|
||||
|
||||
wwan is connected and D-Link announces the LTE default to CRS, but CRS
|
||||
ignores it (distance 200 loses to distance 1). D-Link uses the
|
||||
CRS-announced default (metric 10) for its own traffic, not wwan
|
||||
(metric 100).
|
||||
|
||||
### Failover (GPON down)
|
||||
|
||||
```
|
||||
LAN/SRV/IoT → CRS → vlan6 (→192.168.6.2) → D-Link → wwan → Orange LTE
|
||||
D-Link own → wwan → Orange LTE
|
||||
```
|
||||
|
||||
CRS distance-1 routes go inactive → distance-200 BGP routes from D-Link
|
||||
activate. D-Link receives forwarded traffic on uplink, routes it via the
|
||||
wwan default (IPv4: metric 100; IPv6: non-source-specific, metric 1024), fw4 masquerades the
|
||||
source, packet exits via wwan. Return traffic reverses through masquerade
|
||||
state and forwards back to CRS via the established connection-tracking
|
||||
entry.
|
||||
|
||||
When CRS withdraws its BGP-announced default to D-Link (because GPON is
|
||||
down and CRS has no default to announce), D-Link's kernel default at
|
||||
metric 10 is removed, leaving the wwan default at metric 100 as the
|
||||
preferred route for D-Link's own traffic.
|
||||
|
||||
### Failure detection
|
||||
|
||||
- **D-Link crashes / power loss** → BGP session drops after `hold-time: 30s`
|
||||
→ CRS removes all D-Link-learned routes → internet unavailable if
|
||||
GPON also down (acceptable single-point-of-failure for home network)
|
||||
- **wwan modem goes down** → BIRD2 device protocol detects wwan0 down →
|
||||
static `lte_default` / `lte_default6` become unreachable → BGP withdraws
|
||||
announcements → CRS removes BGP-learned default
|
||||
- **GPON drops** → `pppoe-gpon` interface down → CRS distance-1 default
|
||||
route inactive → distance-200 BGP route activates → CRS withdraws its
|
||||
default-originate announcement to D-Link (since no default is installed
|
||||
any more) → D-Link's kernel default-via-CRS is removed → D-Link uses
|
||||
wwan kernel default → traffic flows from CRS via vlan6 → D-Link → wwan
|
||||
|
||||
All transitions are automatic and driven by interface state. No active
|
||||
probing (Netwatch / mwan3), no scripts toggling routes.
|
||||
|
||||
## NAT rules
|
||||
|
||||
NAT rules are always active, matched by output interface. No
|
||||
failover-triggered toggling needed.
|
||||
|
||||
### CRS (RouterOS)
|
||||
|
||||
- IPv4 `masquerade` on `srcnat` chain with `out-interface: pppoe-gpon`.
|
||||
Only the GPON public interface gets masqueraded — `vlan6` is internal
|
||||
and never NATed; `sit1` (IPv6) has its own dedicated src-nat for the
|
||||
Tailscale prefix.
|
||||
- IPv6 `src-nat tailnet to internet` on `srcnat` chain for Tailscale
|
||||
prefix (`fd7a:115c:a1e0::/48`) to `2001:470:61a3:600::/64`, applied
|
||||
on `out-interface-list: wan`. Fires regardless of whether the
|
||||
egress is `sit1` or `vlan6`.
|
||||
|
||||
### D-Link (OpenWrt fw4)
|
||||
|
||||
- `wwan` zone has `option masq '1'` and `option masq6 '1'`. All traffic
|
||||
exiting via wwan (own outbound + forwarded from `uplink`) is
|
||||
source-NAT'd, IPv4 to the wwan-assigned CG-NAT IP, IPv6 to the
|
||||
wwan-assigned `/128` from the Orange-assigned `/64` prefix.
|
||||
- Forwarding rule `uplink → wwan` allows MikroTik-routed traffic to
|
||||
egress via wwan during failover. Default forward policy on the wwan
|
||||
zone stays REJECT.
|
||||
|
||||
## BGP / route reflection details
|
||||
|
||||
### CRS connection config
|
||||
|
||||
```
|
||||
/routing/bgp/connection set dlink-lte \
|
||||
remote.address=192.168.6.2/32 \
|
||||
local.role=ibgp-rr \
|
||||
nexthop-choice=force-self \
|
||||
output.redistribute=connected,static \
|
||||
output.default-originate=if-installed \
|
||||
hold-time=30s keepalive-time=10s
|
||||
```
|
||||
|
||||
`output.default-originate=if-installed` is required for the `0.0.0.0/0`
|
||||
advertisement because RouterOS does not advertise interface-gateway
|
||||
static routes (gateway=`pppoe-gpon`) via plain `output.redistribute=static`.
|
||||
`default-originate` advertises a synthetic default whenever any active
|
||||
default exists in the routing table, regardless of how it was installed.
|
||||
|
||||
### IPv6 Extended Next Hop workaround
|
||||
|
||||
RouterOS uses BGP Extended Next Hop Encoding (RFC 5549 / RFC 8950) for
|
||||
IPv6 routes on this iBGP session, advertising them with an IPv4-mapped
|
||||
next-hop (`::ffff:192.168.6.1`). The Linux kernel does not support
|
||||
installing IPv6 routes with IPv4 next-hops, so BIRD2 cannot push them
|
||||
directly to the kernel.
|
||||
|
||||
There is no way to disable ENHE on RouterOS — `local.address`,
|
||||
`nexthop-choice: force-self`, and output `set gw` filters all fail to
|
||||
override it. The workaround is on the BIRD2 side: an import filter on
|
||||
the BGP IPv6 channel rewrites `gw` to CRS's native IPv6 address
|
||||
(`2001:470:61a3:600::1`) before the route is exported to the kernel.
|
||||
|
||||
```
|
||||
ipv6 {
|
||||
extended next hop yes;
|
||||
import filter {
|
||||
gw = 2001:470:61a3:600::1;
|
||||
accept;
|
||||
};
|
||||
...
|
||||
};
|
||||
```
|
||||
|
||||
The reverse direction (D-Link → CRS) was solved cleanly via a BIRD2 export
|
||||
filter setting `bgp_next_hop = 2001:470:61a3:600::2`, since BGP-level
|
||||
attribute manipulation isn't constrained by kernel limitations.
|
||||
|
||||
### Direct protocol on D-Link
|
||||
|
||||
BIRD2 needs to know about the directly connected `192.168.6.0/24` and
|
||||
`2001:470:61a3:600::/64` subnets on `eth0.6` to resolve BGP next-hops.
|
||||
The `protocol direct { interface "eth0.6"; }` declaration provides this;
|
||||
without it BIRD2 marks all CRS-learned routes as unreachable.
|
||||
|
||||
## BM806C modem cold-boot wedge
|
||||
|
||||
The BM806C firmware enters a permanently broken state on cold boot:
|
||||
`/dev/cdc-wdm0` exists, kernel driver attaches, but uqmi commands return
|
||||
`"Failed to connect to service"` for at least the first ~5 minutes; the UIM (SIM) QMI service
|
||||
specifically never comes up.
|
||||
|
||||
Recovery requires a USB device re-enumeration. The `/etc/init.d/wwan-bringup`
|
||||
service writes `0` then `1` to `/sys/bus/usb/devices/1-1/authorized` on
|
||||
boot, then triggers `ifup wwan`. After re-auth the modem completes its
|
||||
QMI initialization within ~1 second.
|
||||
|
||||
Full investigation: see [wwan-bm806c-qmi-workaround.md](./wwan-bm806c-qmi-workaround.md).
|
||||
|
||||
## Implementation files
|
||||
|
||||
| File | Role |
|
||||
|---|---|
|
||||
| `ansible/roles/routeros/tasks/base.yml` | `vlan6` in `wan` interface list |
|
||||
| `ansible/roles/routeros/tasks/routing.yml` | BGP instance, template, `dlink-lte` connection |
|
||||
| `ansible/roles/routeros/tasks/firewall.yml` | IPv4 masquerade narrowed to `pppoe-gpon`; BGP input rules for `vlan6` |
|
||||
| `ansible/roles/openwrt/tasks/network.yml` | `wwan` interface (no auto bringup); `uplink` with no static gateway |
|
||||
| `ansible/roles/openwrt/tasks/firewall.yml` | `wwan` zone with `masq '1'` / `masq6 '1'`; `uplink → wwan` forwarding |
|
||||
| `ansible/roles/openwrt/tasks/bird.yml` | BIRD2 install + config |
|
||||
| `ansible/roles/openwrt/tasks/wwan.yml` | qmi.sh patches, BM806C profiles, wwan-bringup init script |
|
||||
| `ansible/roles/openwrt/defaults/main.yml` | `bird2` in `openwrt_packages` |
|
||||
+7
-2
@@ -115,8 +115,13 @@ One of quirks of the ISP is that it doesn't allow incoming port 53/DNS connectio
|
||||
|
||||
The ISP does not provide any IPv6 connectivity at all. For that purpose I'm using [tunnel broker from Hurricane Electric](https://tunnelbroker.net/), which gives /48 routed prefix that I divided to /64 networks.
|
||||
|
||||
There used to be backup internet link using USB LTE modem connected to CRS, which was exposing NDIS interface, but when installing D-Link I decided to remove the modem and move SIM card to it to reduce clutter in rack and have direct access to fully fledged modem, not just web interface management. Configuration of lte1 modem is yet to be removed from the CRS configuration. Modem in D-Link requires workaround to work due to firmware bug, described in detail in [LTE failover (BroadMobi BM806C / D-Link DWR-921 C1) — QMI data-plane workaround](./wwan-bm806c-qmi-workaround.md). It is currently partially configured, with internet working on OpenWRT router when enabled, but failover functionality of internet gateway on CRS is yet to be designed and implemented.
|
||||
SIM card allows for IPv4 and IPv6 connectivity via separate APNs. Network hands out globally routable IPv6 prefix, but there are no incoming IPv6 connections, which is most likely network carrier enforced firewall. Network works when using two different APNs at once, but when using the card in Android phone, there's no need to configure two separate APNs, IPv6 alone is sufficient. Whether the network announces NAT64 and Android phone is doing CLAT or how is that working exactly and if we can utilize it in our network to simplify connection is yet to be figured out.
|
||||
The backup internet link is an LTE connection via the embedded BroadMobi BM806C modem in the D-Link router (Orange Poland, dual-stack). The SIM was previously in a USB modem attached directly to the CRS; it was moved to the D-Link to reduce rack clutter and gain access to a proper modem interface. The modem requires host-side workarounds for several firmware bugs — QMI data-plane bugs and a cold-boot UIM wedge that needs USB re-enumeration — documented in [LTE failover (BroadMobi BM806C / D-Link DWR-921 C1) — QMI data-plane workaround](./wwan-bm806c-qmi-workaround.md).
|
||||
|
||||
Failover is implemented using iBGP between the D-Link (BIRD2, AS 65000) and the CRS (`local.role: ibgp-rr`, so the CRS acts as a route reflector for the D-Link). The D-Link announces `0.0.0.0/0` and `2000::/3` to the CRS whenever its `wwan` interface is up. The CRS installs these at BGP distance 200 — a lower preference than the GPON static default at distance 1 — so they only become active when GPON fails. The CRS in turn announces all its connected and static routes (VLAN subnets, Tailscale, k8s pod/service/LB prefixes via route reflection) to the D-Link so it always has explicit routes to internal subnets regardless of WAN state. The D-Link's own default route also comes from this BGP session (no static gateway on the uplink interface); when the CRS withdraws the default on GPON failure, the D-Link falls back to its wwan kernel route at metric 100.
|
||||
|
||||
For full design rationale, route exchange tables, and implementation notes including the BGP Extended Next Hop workarounds, see [LTE failover design](./lte-failover-design.md).
|
||||
|
||||
During LTE failover, all VLANs route through `vlan6` to the D-Link, which forwards traffic out `wwan` and masquerades it (IPv4 and IPv6 via fw4 `masq`/`masq6`). IPv6 is outbound-only — the carrier enforces an inbound firewall, and there is no routed prefix large enough to cover all internal subnets without NAT.
|
||||
|
||||
## Configuration management
|
||||
|
||||
|
||||
@@ -1,25 +1,35 @@
|
||||
# LTE failover (BroadMobi BM806C / D-Link DWR-921 C1) — QMI data-plane workaround
|
||||
|
||||
Last verified: 2026-05-16, OpenWrt 25.12.2 r32802-f505120278, netifd 2026.02.26~cbb83a18-r1.
|
||||
Last verified: 2026-05-27, OpenWrt 25.12.2 r32802-f505120278, netifd 2026.02.26~cbb83a18-r1.
|
||||
|
||||
## TL;DR
|
||||
|
||||
The embedded BroadMobi BM806C modem in the D-Link DWR-921 attaches to
|
||||
LTE, gets assigned IP addresses through QMI, reports `"connected"` —
|
||||
but **no downlink data passes**. Every TCP SYN we send out is dropped
|
||||
somewhere between the modem and the host kernel, and we never see a
|
||||
SYN-ACK. After several hours of layered diagnostics we identified two
|
||||
independent issues, both of which must be fixed for QMI to work on this
|
||||
device:
|
||||
The embedded BroadMobi BM806C modem in the D-Link DWR-921 has **three
|
||||
independent bugs** in its firmware (`M1.2.0_E1.0.1_A1.1.8`, the only
|
||||
build that has ever shipped), all of which must be worked around for a
|
||||
usable LTE uplink:
|
||||
|
||||
1. **`qmi.sh` requests `802.3` framing** from the modem.
|
||||
1. **Cold-boot UIM wedge.** On every cold boot, the modem's UIM (SIM)
|
||||
QMI service comes up permanently broken: `--uim-get-sim-state`
|
||||
returns `{}`, `--get-imsi` returns `"UIM uninitialized"`, and
|
||||
`AT+CPIN?` returns `+CME ERROR: SIM busy`. The modem **never
|
||||
recovers on its own** (verified at uptime 21 min). A single USB
|
||||
re-enumeration (`echo 0 > /sys/.../1-1/authorized; sleep 3;
|
||||
echo 1 > ...`) forces the modem to redo its internal QMI init from
|
||||
scratch, after which UIM comes up within ~1 s. The
|
||||
`wwan-bringup` service installed by this role does the
|
||||
re-enumeration unconditionally on boot, then calls `ifup wwan`.
|
||||
Full investigation: `/root/wwan-diag/boot-wedge-investigation.md`
|
||||
on the router.
|
||||
|
||||
2. **`qmi.sh` requests `802.3` framing** from the modem.
|
||||
The BM806C's `802.3` firmware path is buggy on this generation of
|
||||
Qualcomm silicon; raw-ip framing works correctly. The same kernel
|
||||
maintainer who added raw-ip support to `qmi_wwan` documents
|
||||
"buggy 802.3 firmware implementation" as a known issue for the
|
||||
MDM9x25 family this modem is built on.
|
||||
|
||||
2. **`qmi.sh` calls `uqmi --start-network --apn <foo>`** to bring up
|
||||
3. **`qmi.sh` calls `uqmi --start-network --apn <foo>`** to bring up
|
||||
the bearer. On BM806C this triggers a known firmware bug
|
||||
([OpenWrt FS#1363](https://github.com/openwrt/openwrt/issues/6295))
|
||||
that establishes a *phantom* bearer: kernel and modem agree there is
|
||||
@@ -29,18 +39,48 @@ device:
|
||||
<N>` against a pre-configured NVRAM profile **with the same APN**
|
||||
works perfectly.
|
||||
|
||||
Our workaround patches `qmi.sh` in two places (raw-ip + a kernel
|
||||
`-EBUSY` fix), creates a second NVRAM profile in the modem for the
|
||||
IPv6 APN, and adds `option profile`/`option v6profile` to the UCI
|
||||
`wwan` interface so `qmi.sh` uses the working code path. After the
|
||||
workaround, `ifup wwan` produces a fully working dual-stack IPv4 +
|
||||
IPv6 LTE uplink — verified end-to-end at HTTPS layer to multiple
|
||||
Bug 1 is the boot-time wedge; without the workaround `wwan` simply
|
||||
never comes up after a reboot. Bugs 2 and 3 are about the data plane
|
||||
itself; without their workarounds, `wwan` comes up but no traffic
|
||||
flows. Our role addresses all three: it installs `wwan-bringup`
|
||||
(re-enumerates the USB device once on boot, then `ifup wwan`), patches
|
||||
`qmi.sh` in two places (raw-ip + a kernel `-EBUSY` fix), creates a
|
||||
second NVRAM profile in the modem for the IPv6 APN, and adds
|
||||
`option profile`/`option v6profile` to the UCI `wwan` interface so
|
||||
`qmi.sh` uses the working code path. After all three workarounds,
|
||||
cold boot to working dual-stack IPv4+IPv6 LTE uplink completes in
|
||||
~2 min 30 s to 3 min 30 s — verified end-to-end at the HTTPS layer to multiple
|
||||
upstreams.
|
||||
|
||||
## Symptoms
|
||||
|
||||
When QMI is broken on this modem, all of the following are true at the
|
||||
same time:
|
||||
### Boot-wedge symptoms (bug 1)
|
||||
|
||||
When the modem boots into the UIM-wedged state, all of the following
|
||||
hold simultaneously:
|
||||
|
||||
- `/dev/cdc-wdm0` exists, `wwan0` netdev exists, `qmi_wwan` driver is
|
||||
bound to `1-1:1.4` — kernel side looks fine
|
||||
- `ifup wwan` runs forever in the SIM-init loop:
|
||||
`wwan: SIM in illegal state - Power-cycling SIM` repeating every ~8 s
|
||||
- `uqmi -d /dev/cdc-wdm0 --uim-get-sim-state` returns `{}` (empty
|
||||
body — no `card_application_state` field at all)
|
||||
- `uqmi -d /dev/cdc-wdm0 --get-imsi` returns the QMI string
|
||||
`"UIM uninitialized"`
|
||||
- `uqmi -d /dev/cdc-wdm0 --get-pin-status` returns
|
||||
`"Invalid arguments given"` (uqmi cannot allocate a UIM client
|
||||
because the modem-side service has not registered)
|
||||
- AT side: `AT+CFUN?` returns `+CFUN: 1` (modem firmware is alive),
|
||||
`AT+CPIN?` returns `+CME ERROR: SIM busy`, and `AT+CREG?` /
|
||||
`AT+CEREG?` / `AT+COPS?` all return bare `ERROR`
|
||||
- This persists indefinitely; we measured no recovery at uptime
|
||||
21 min
|
||||
|
||||
### Data-plane symptoms (bugs 2 and 3)
|
||||
|
||||
When the modem comes up cleanly but the qmi.sh patches are missing or
|
||||
the wrong `--start-network` invocation is used, all of the following
|
||||
are true at the same time:
|
||||
|
||||
- `ifup wwan` succeeds, `ifstatus wwan` reports `"up": true`
|
||||
- `wwan0` has a valid CG-NAT IPv4 (`10.x.x.x/30`) and IPv6
|
||||
@@ -184,9 +224,13 @@ You are affected if all of these hold:
|
||||
1. Your modem reports `Manufacturer: BroadMobi`, `Model: BM806C` (or
|
||||
`BM806U`), `Revision: M1.2.0_E1.0.1_A1.1.8`. Check via any AT port:
|
||||
`printf 'ATI\r' | picocom -qrx 3000 /dev/ttyUSB2`.
|
||||
2. Your USB IDs (after `usb-modeswitch` runs) are
|
||||
`2020:2033`. Check `/sys/bus/usb/devices/<port>/idVendor` /
|
||||
`idProduct`.
|
||||
2. Your USB IDs are `2020:2033`. Check
|
||||
`/sys/bus/usb/devices/<port>/idVendor` / `idProduct`. On the C1
|
||||
hardware revision the modem cold-boots directly into `2020:2033`
|
||||
QMI composite mode — no `usb-modeswitch` involved (there is no
|
||||
`2020:2033` entry in `/etc/usb-mode.json` on our build). Other
|
||||
hardware revisions may go through an EDL `05c6:9008` →
|
||||
`2020:2033` modeswitch first.
|
||||
3. `qmi.sh` (`/lib/netifd/proto/qmi.sh`) is the unmodified upstream
|
||||
netifd handler. Grep for `--wda-set-data-format 802.3` —
|
||||
if present, you have the unpatched script.
|
||||
@@ -207,11 +251,11 @@ data flowing with `--start-network --profile 1` but not with
|
||||
| uqmi | 2025.07.30~7914da43-r2 |
|
||||
| libqmi / qmi-utils | 1.36.0-r1 |
|
||||
| luci-proto-qmi | 26.133.20346~e9ebca7 |
|
||||
| qmi_wwan kernel driver | in-tree, kernel 6.12.74 |
|
||||
| qmi_wwan kernel driver | backports from Linux v6.18.7 (per dmesg) |
|
||||
| LTE modem | BroadMobi BM806C (Qualcomm MDM9225) |
|
||||
| Modem firmware | `M1.2.0_E1.0.1_A1.1.8` |
|
||||
| Modem USB id (data mode) | `2020:2033` |
|
||||
| Modem USB id (EDL mode) | `05c6:9008` (before `usb-modeswitch`) |
|
||||
| Modem USB id (data mode) | `2020:2033` (cold-boots directly into this) |
|
||||
| Modem USB id (EDL mode) | `05c6:9008` (not observed on C1; may apply to other revs) |
|
||||
| Mobile network | Orange Poland (MCC 260 / MNC 03) |
|
||||
| APN (IPv4 / dual-stack) | `internet` (auth: PAP, user/pass `internet`/`internet`) |
|
||||
| APN (IPv6) | `internetipv6` (same auth) |
|
||||
@@ -226,9 +270,25 @@ data flowing with `--start-network --profile 1` but not with
|
||||
documents the 802.3-firmware-is-buggy reality across this generation.
|
||||
Search the mainline kernel for `QMI_WWAN_FLAG_RAWIP`.
|
||||
- Kernel commit "net: qmi_wwan: add BroadMobi BM806U 2020:2033"
|
||||
(Pawel Dembicki, 2018): adds the `qmi_wwan` entry for our exact USB
|
||||
id `2020:2033`. The BM806C and BM806U share the device id and
|
||||
qmi_wwan driver path.
|
||||
(Pawel Dembicki, 2018, `6cb2669cb97f`): adds the `qmi_wwan` entry
|
||||
for our exact USB id `2020:2033` as `QMI_FIXED_INTF(0x2020, 0x2033, 4)`
|
||||
with no quirks. The BM806C and BM806U share the device id and
|
||||
qmi_wwan driver path. The entry has not been touched in mainline
|
||||
through v6.18.7 (what OpenWrt 25.12.2 ships via backports).
|
||||
- libqmi maintainer Aleksander Morgado on cdc-wdm port readiness
|
||||
timing (libqmi-devel, Sep 2021):
|
||||
<https://lists.freedesktop.org/archives/libqmi-devel/2021-September/003695.html>
|
||||
— explains that cdc-wdm appearing in `/dev` is not a guarantee that
|
||||
the modem-side QMI service is operational. ModemManager uses up to
|
||||
45 s of warmup tolerance; we measured this modem firmware needs
|
||||
~5 min before CTL is even responsive, and UIM never converges
|
||||
without a USB re-enumeration.
|
||||
- `CastixGitHub/re_wwan` (<https://github.com/CastixGitHub/re_wwan>):
|
||||
another BM806C user, identical firmware build, identical recovery
|
||||
pattern (`rmmod qmi_wwan; insmod qmi_wwan` to recover from a hung
|
||||
modem; AT-side `AT+CFUN=` resets reported as not working). Useful
|
||||
independent confirmation that the right primitive is module
|
||||
reload / USB re-enumeration, not a soft reset.
|
||||
- D-Link DWR-921 support page (firmware images, region-specific):
|
||||
hardware revision C3 on the Polish site lists firmware
|
||||
`1.01.3.006 Generic`, `1.00B07 T-Mobile`, `1.00B06 Plus/Cyfrowy Polsat
|
||||
@@ -271,9 +331,16 @@ auto-start at boot. This is a deliberate failover-only setup —
|
||||
human (or future failover script, e.g. `mwan3`) decides when to
|
||||
bring up wwan.
|
||||
|
||||
This also sidesteps a fragile boot ordering question: the modem takes
|
||||
30–90 s after boot before its QMI service is responsive, and netifd
|
||||
would otherwise repeatedly fail and back off during that window.
|
||||
This also sidesteps a fragile boot ordering question: on cold boot the
|
||||
modem's **UIM (SIM) QMI service comes up permanently broken** and never
|
||||
recovers without an explicit USB re-enumeration (`echo 0/1 >
|
||||
/sys/bus/usb/devices/1-1/authorized`). Other QMI services (CTL, NAS,
|
||||
WDS) do come up after ~5 min of warmup, but UIM does not — verified at
|
||||
uptime 21 min with no intervention. The `wwan-bringup` service handles
|
||||
the re-enumeration on boot and then calls `ifup wwan` itself; netifd
|
||||
never has to deal with the wedge directly. See
|
||||
`/root/wwan-diag/boot-wedge-investigation.md` on the router for the
|
||||
full root-cause analysis (2026-05-27).
|
||||
|
||||
### IPv6 is via a second NVRAM profile, not a single dual-stack PDP
|
||||
|
||||
@@ -508,19 +575,23 @@ In rough priority order:
|
||||
- The current "patch the file, reapply via Ansible" approach is the
|
||||
simplest and most direct. It is fine as long as the role is the
|
||||
source of truth.
|
||||
5. **Implement actual failover.** `mwan3` is the conventional choice.
|
||||
5. **Periodic session keepalive / reconnect on detach.** Now that
|
||||
boot bring-up is fast and reliable (~2:30–3:30 from cold boot to
|
||||
wwan up), the next likely failure mode is the modem getting
|
||||
deactivated by the network (`+CEER: Regular deactivation`) after
|
||||
long idle periods. A simple `procd` service that polls
|
||||
`uqmi --get-data-status` and triggers `ifup wwan` on transition
|
||||
`connected → disconnected` would close this gap. Don't pre-emptively
|
||||
add it; wait until you have evidence the problem occurs in practice
|
||||
with the workaround in place. If the disconnect comes with UIM
|
||||
going bad (same wedge signature as cold boot), the keepalive needs
|
||||
to call `wwan-bringup` (which re-authorizes the USB device) rather
|
||||
than `ifup wwan` directly.
|
||||
6. **Implement actual failover.** `mwan3` is the conventional choice.
|
||||
Alternatively a tiny shell loop that pings a target via `uplink`
|
||||
and triggers `ifup wwan` / `ifdown wwan` on transitions. Either way
|
||||
the wwan side of the work is done; the failover orchestration is a
|
||||
separate problem.
|
||||
6. **Periodic session keepalive / reconnect on detach.** Even after
|
||||
our fix, the modem can still get deactivated by the network
|
||||
(`+CEER: Regular deactivation`) after long idle periods. A simple
|
||||
`procd` service that polls `uqmi --get-data-status` and triggers
|
||||
`ifup wwan` on transition `connected → disconnected` would close
|
||||
this gap. Don't pre-emptively add it; wait until you have
|
||||
evidence the problem occurs in practice with the workaround in
|
||||
place.
|
||||
7. **Investigate `mbim` mode**. The BM806C does not currently expose
|
||||
MBIM, but the modem chipset (MDM9225) supports it at the silicon
|
||||
level. Whether there exists a magic AT command, vendor QMI message,
|
||||
@@ -570,16 +641,33 @@ In rough priority order:
|
||||
Always cross-reference with `+CEREG?` and `+CGACT?` to know if you
|
||||
are presently attached.
|
||||
- `uqmi -t 5000 -d /dev/cdc-wdm0 --get-serving-system` returns
|
||||
`"Failed to connect to service"` for the first 30–90 s after
|
||||
boot. This is the QMI service inside the modem firmware not being
|
||||
up yet, not a host-side problem.
|
||||
`"Failed to connect to service"` (or `"Unknown error"`) for the
|
||||
first ~5 minutes after cold boot. CTL/NAS/WDS *do* eventually come
|
||||
up (we measured `--get-versions` first OK at uptime 320 s,
|
||||
serving-system at 376 s), but they flap in and out for several more
|
||||
minutes. **UIM never comes up on cold boot without a USB
|
||||
re-enumeration** — `--uim-get-sim-state` keeps returning `{}` and
|
||||
`--get-imsi` keeps returning `"UIM uninitialized"` even at uptime
|
||||
21 minutes. This is why the `wwan-bringup` worker now does an
|
||||
unconditional `authorized=0/1` re-enumeration immediately after the
|
||||
modem enumerates; it is not waiting for warmup, it is forcing the
|
||||
modem to redo its init from scratch.
|
||||
- A reliable cold-boot vs. wedged-modem discriminator from the AT side:
|
||||
`AT+CPIN?` returning `+CME ERROR: SIM busy` while `AT+CFUN?` returns
|
||||
`+CFUN: 1` means the modem firmware is alive but UIM is stuck. If
|
||||
this persists past uptime 5 minutes the modem will not recover on
|
||||
its own; re-authorize the USB port.
|
||||
- The diagnostic scripts we accumulated live on the router at
|
||||
`/root/wwan-diag/` (created during debugging; not part of the
|
||||
Ansible role). The most useful ones are `at.sh` (run AT commands
|
||||
through `picocom`), `ppp-test.sh` (PPP-via-AT as a control test
|
||||
that bypasses QMI), and `qmi-dual-profile.sh` (manual
|
||||
reproduction of the working `--profile`-based dual-stack flow).
|
||||
Feel free to delete them once this is stable; they are not
|
||||
that bypasses QMI), `qmi-dual-profile.sh` (manual reproduction of
|
||||
the working `--profile`-based dual-stack flow), and
|
||||
`boot-capture.sh` (instrumented per-service probe that maps the
|
||||
cold-boot wedge timeline; every probe wrapped in `/usr/bin/timeout`
|
||||
so it cannot hang). The full root-cause writeup for the boot wedge
|
||||
is at `/root/wwan-diag/boot-wedge-investigation.md`. Feel free to
|
||||
delete the older scripts once this is stable; they are not
|
||||
load-bearing.
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
Reference in New Issue
Block a user