规划理论与方法
欢迎学习规划理论与方法!本模块基于 Malik Ghallab、Dana Nau 和 Paolo Traverso 的经典教材 Acting, Planning, and Learning ,系统地讲解规划理论的核心概念和方法。
什么是规划?
┌─────────────────────────────────────────────────────────────┐
│ 规划的定义 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 规划是寻找一系列动作的过程, │
│ 使得智能体从初始状态达到目标状态。 │
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ 初始状态 │───▶│ 动作序列 │───▶│ 目标状态 │ │
│ │ (Initial) │ │ (Plan) │ │ (Goal) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │
│ 规划的核心问题: │
│ - 如何表示状态和动作? │
│ - 如何搜索有效的动作序列? │
│ - 如何处理不确定性? │
│ - 如何实现实时规划? │
│ │
└─────────────────────────────────────────────────────────────┘
学习路线图
flowchart TB
A([开始学习规划]) --> B[规划问题基础]
B --> C[确定性规划]
C --> D[概率规划]
D --> E[强化学习]
E --> F[非确定性规划]
F --> G[时间规划]
G --> H[运动规划]
H --> I[规划与学习]
B -.-> B1[状态空间]
B -.-> B2[动作表示]
B -.-> B3[目标描述]
C -.-> C1[状态空间搜索]
C -.-> C2[启发式搜索]
C -.-> C3[规划空间搜索]
D -.-> D1[MDP]
D -.-> D2[值迭代]
D -.-> D3[策略迭代]
E -.-> E1[Q-Learning]
E -.-> E2[策略梯度]
E -.-> E3[深度强化学习]
F -.-> F1[And/Or图]
F -.-> F2[确定化技术]
F -.-> F3[模型检测]
G -.-> G1[时间约束]
G -.-> G2[时间网络]
G -.-> G3[时间规划器]
H -.-> H1[路径规划]
H -.-> H2[运动规划]
H -.-> H3[操作规划]
课程结构
第一部分:规划问题基础(2周)
周次
主题
内容
链接
第1周
规划问题概述
状态空间、动作表示、目标描述
规划问题
第2周
规划算法基础
搜索算法、图搜索、树搜索
算法基础
第二部分:确定性规划(3周)
周次
主题
内容
链接
第3周
状态空间搜索
前向搜索、后向搜索
确定性规划
第4周
启发式搜索
启发函数、A*算法
确定性规划
第5周
规划空间搜索
部分有序规划、规划修复
确定性规划
第三部分:概率规划(3周)
周次
主题
内容
链接
第6周
MDP 基础
马尔可夫决策过程
概率规划
第7周
动态规划
值迭代、策略迭代
概率规划
第8周
强化学习
Q-Learning、策略梯度
强化学习
第四部分:高级规划方法(4周)
周次
主题
内容
链接
第9周
非确定性规划
And/Or图、确定化技术
非确定性规划
第10周
时间规划
时间约束、时间网络
时间规划
第11周
运动规划
路径规划、运动规划
运动规划
第12周
规划与学习
层次化规划、学习规划
规划与学习
核心概念速览
1. 状态空间
# 状态空间表示
class State :
def __init__ ( self , robot_pos , objects ):
self . robot_pos = robot_pos # 机器人位置
self . objects = objects # 物体状态
def __eq__ ( self , other ):
return ( self . robot_pos == other . robot_pos and
self . objects == other . objects )
def __hash__ ( self ):
return hash (( self . robot_pos , tuple ( self . objects . items ())))
# 示例
initial_state = State (
robot_pos = ( 0 , 0 ),
objects = { 'box' : ( 1 , 1 ), 'goal' : ( 2 , 2 )}
)
2. 动作表示
# 动作表示
class Action :
def __init__ ( self , name , preconditions , effects ):
self . name = name
self . preconditions = preconditions # 前置条件
self . effects = effects # 效果
def is_applicable ( self , state ):
"""检查动作是否可应用"""
return all ( pred ( state ) for pred in self . preconditions )
def apply ( self , state ):
"""应用动作到状态"""
new_state = state . copy ()
for effect in self . effects :
effect ( new_state )
return new_state
# 示例:移动动作
def move_action ( direction ):
def precondition ( state ):
# 检查是否可以移动
return True
def effect ( state ):
# 更新机器人位置
x , y = state . robot_pos
if direction == 'up' :
state . robot_pos = ( x , y + 1 )
elif direction == 'down' :
state . robot_pos = ( x , y - 1 )
elif direction == 'left' :
state . robot_pos = ( x - 1 , y )
elif direction == 'right' :
state . robot_pos = ( x + 1 , y )
return Action ( f 'move_ { direction } ' , [ precondition ], [ effect ])
3. 规划问题
# 规划问题定义
class PlanningProblem :
def __init__ ( self , initial_state , goal_state , actions ):
self . initial_state = initial_state
self . goal_state = goal_state
self . actions = actions
def is_goal ( self , state ):
"""检查是否达到目标"""
return state == self . goal_state
def get_applicable_actions ( self , state ):
"""获取可应用的动作"""
return [ a for a in self . actions if a . is_applicable ( state )]
# 示例
problem = PlanningProblem (
initial_state = initial_state ,
goal_state = goal_state ,
actions = [ move_action ( 'up' ), move_action ( 'down' ),
move_action ( 'left' ), move_action ( 'right' )]
)
4. 搜索算法
# 广度优先搜索
def breadth_first_search ( problem ):
"""广度优先搜索"""
from collections import deque
frontier = deque ([ problem . initial_state ])
explored = set ()
parent = { problem . initial_state : None }
action_taken = { problem . initial_state : None }
while frontier :
state = frontier . popleft ()
if problem . is_goal ( state ):
# 重建路径
path = []
while state is not None :
if action_taken [ state ] is not None :
path . append ( action_taken [ state ])
state = parent [ state ]
return path [:: - 1 ]
explored . add ( state )
for action in problem . get_applicable_actions ( state ):
new_state = action . apply ( state )
if new_state not in explored and new_state not in frontier :
frontier . append ( new_state )
parent [ new_state ] = state
action_taken [ new_state ] = action . name
return None # 无解
5. 启发式搜索
import heapq
def a_star_search ( problem , heuristic ):
"""A* 搜索算法"""
# 优先队列:(f值, 状态)
frontier = [( heuristic ( problem . initial_state ), problem . initial_state )]
explored = set ()
g_score = { problem . initial_state : 0 }
parent = { problem . initial_state : None }
action_taken = { problem . initial_state : None }
while frontier :
f , state = heapq . heappop ( frontier )
if problem . is_goal ( state ):
# 重建路径
path = []
while state is not None :
if action_taken [ state ] is not None :
path . append ( action_taken [ state ])
state = parent [ state ]
return path [:: - 1 ]
if state in explored :
continue
explored . add ( state )
for action in problem . get_applicable_actions ( state ):
new_state = action . apply ( state )
tentative_g = g_score [ state ] + 1 # 假设每步代价为1
if new_state not in g_score or tentative_g < g_score [ new_state ]:
g_score [ new_state ] = tentative_g
f_score = tentative_g + heuristic ( new_state )
heapq . heappush ( frontier , ( f_score , new_state ))
parent [ new_state ] = state
action_taken [ new_state ] = action . name
return None # 无解
# 启发函数示例:曼哈顿距离
def manhattan_distance ( state , goal ):
"""曼哈顿距离启发函数"""
x1 , y1 = state . robot_pos
x2 , y2 = goal . robot_pos
return abs ( x1 - x2 ) + abs ( y1 - y2 )
6. MDP(马尔可夫决策过程)
# MDP 表示
class MDP :
def __init__ ( self , states , actions , transition_model , reward_function , discount_factor ):
self . states = states
self . actions = actions
self . transition_model = transition_model # P(s'|s,a)
self . reward_function = reward_function # R(s,a,s')
self . discount_factor = discount_factor # γ
def get_transition_probability ( self , state , action , next_state ):
"""获取转移概率"""
return self . transition_model ( state , action , next_state )
def get_reward ( self , state , action , next_state ):
"""获取奖励"""
return self . reward_function ( state , action , next_state )
# 值迭代算法
def value_iteration ( mdp , epsilon = 0.01 ):
"""值迭代算法"""
# 初始化值函数
V = { s : 0 for s in mdp . states }
while True :
delta = 0
V_new = V . copy ()
for s in mdp . states :
# 计算每个动作的 Q 值
q_values = []
for a in mdp . actions :
q = sum (
mdp . get_transition_probability ( s , a , s_prime ) *
( mdp . get_reward ( s , a , s_prime ) + mdp . discount_factor * V [ s_prime ])
for s_prime in mdp . states
)
q_values . append ( q )
# 更新值函数
V_new [ s ] = max ( q_values )
delta = max ( delta , abs ( V_new [ s ] - V [ s ]))
V = V_new
# 检查收敛
if delta < epsilon :
break
# 提取策略
policy = {}
for s in mdp . states :
q_values = []
for a in mdp . actions :
q = sum (
mdp . get_transition_probability ( s , a , s_prime ) *
( mdp . get_reward ( s , a , s_prime ) + mdp . discount_factor * V [ s_prime ])
for s_prime in mdp . states
)
q_values . append (( q , a ))
policy [ s ] = max ( q_values )[ 1 ]
return V , policy
7. Q-Learning
import numpy as np
class QLearning :
"""Q-Learning 算法"""
def __init__ ( self , states , actions , learning_rate = 0.1 , discount_factor = 0.9 ,
exploration_rate = 0.1 ):
self . states = states
self . actions = actions
self . learning_rate = learning_rate
self . discount_factor = discount_factor
self . exploration_rate = exploration_rate
# 初始化 Q 表
self . q_table = {( s , a ): 0 for s in states for a in actions }
def choose_action ( self , state ):
"""选择动作(ε-贪心策略)"""
if np . random . random () < self . exploration_rate :
return np . random . choice ( self . actions )
else :
q_values = [ self . q_table [( state , a )] for a in self . actions ]
return self . actions [ np . argmax ( q_values )]
def update ( self , state , action , reward , next_state ):
"""更新 Q 值"""
# 获取下一个状态的最大 Q 值
max_q_next = max ( self . q_table [( next_state , a )] for a in self . actions )
# 更新 Q 值
current_q = self . q_table [( state , action )]
self . q_table [( state , action )] = current_q + self . learning_rate * (
reward + self . discount_factor * max_q_next - current_q
)
def get_policy ( self ):
"""获取策略"""
policy = {}
for s in self . states :
q_values = [ self . q_table [( s , a )] for a in self . actions ]
policy [ s ] = self . actions [ np . argmax ( q_values )]
return policy
规划方法分类
方法
特点
适用场景
确定性规划
状态转移确定
已知环境
概率规划
状态转移概率已知
不确定环境
强化学习
通过试错学习
未知环境
非确定性规划
状态转移不确定
最坏情况
时间规划
考虑时间约束
实时系统
运动规划
连续空间
机器人控制
层次化规划
任务分解
复杂任务
学习建议
1. 循序渐进
基础 → 进阶 → 应用
│ │ │
▼ ▼ ▼
状态空间 → MDP → 强化学习
搜索算法 → 启发式 → 概率规划
动作表示 → 时间规划 → 运动规划
2. 理论与实践结合
# 理论学习
# 1. 理解规划问题定义
# 2. 掌握搜索算法
# 3. 理解 MDP 和强化学习
# 实践应用
# 1. Python 实现
# 2. ROS 集成
# 3. 仿真验证
3. 可视化理解
import matplotlib.pyplot as plt
import numpy as np
# 可视化规划问题
def visualize_grid_world ( states , obstacles , goal , path = None ):
"""可视化网格世界"""
grid = np . zeros (( 10 , 10 ))
# 标记障碍物
for obs in obstacles :
grid [ obs ] = - 1
# 标记目标
grid [ goal ] = 2
# 标记路径
if path :
for pos in path :
grid [ pos ] = 1
plt . figure ( figsize = ( 8 , 8 ))
plt . imshow ( grid , cmap = 'YlOrRd' )
plt . colorbar ()
plt . title ( 'Grid World' )
plt . show ()
参考资源
教材
Acting, Planning, and Learning - Malik Ghallab, Dana Nau, Paolo Traverso
Planning Algorithms - Steven M. LaValle
Artificial Intelligence: A Modern Approach - Stuart Russell, Peter Norvig
Reinforcement Learning: An Introduction - Richard S. Sutton, Andrew G. Barto
在线课程
Coursera: Robotics Specialization
edX: Robotics
MIT 6.800: Planning and Decision Making
工具
PDDL - 规划领域定义语言
ROS Navigation Stack - 机器人导航
OpenAI Gym - 强化学习环境
下一步
选择一个主题开始学习:
规划问题
算法基础
确定性规划
概率规划
← 返回首页