Commit 8838b1d1 authored Aug 24, 2021 by Christian
refactored noisydqn
parent 5fa29847
Showing 3 changed files with 3 additions and 391 deletions:

feudalconfig.cfg                              +2 −2
policy/feudalgainRL/FeudalNoisyDQNPolicy.py   +1 −3
policy/feudalgainRL/noisydqn.py               +0 −386
feudalconfig.cfg  +2 −2

@@ -72,7 +72,7 @@ sample_argmax = False
 features = learned
 si_policy_type = acer
 only_master = True
-jsd_reward = True
+jsd_reward = False
 #jsd_function = tanh
 js_threshold = 0.2
 js_threshold_master = 1
@@ -89,7 +89,7 @@ env_model_path = env_model/env1_acer_200.pkl
 [dqnpolicy]
 q_update = double
-architecture = duel
+architecture = noisy_duel
 #architecture = duel
 h1_size = 300
 h2_size = 100
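For orientation, PyDial-style config files like this one are read with Python's standard configparser. Below is a minimal sketch of how the changed keys would be picked up; the section name for the jsd_reward/js_threshold keys and the fallback values are assumptions, since the first hunk does not show its section header:

# Illustrative sketch only, not code from the repository.
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("feudalconfig.cfg")

# [dqnpolicy] is the section shown in the second hunk
architecture = cfg.get("dqnpolicy", "architecture", fallback="duel")        # "noisy_duel" after this commit
# the section for these keys is assumed here to be [feudalpolicy]
jsd_reward = cfg.getboolean("feudalpolicy", "jsd_reward", fallback=False)   # False after this commit
js_threshold = cfg.getfloat("feudalpolicy", "js_threshold", fallback=1.0)   # 0.2 in this config

print(architecture, jsd_reward, js_threshold)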
policy/feudalgainRL/FeudalNoisyDQNPolicy.py  +1 −3

@@ -106,7 +106,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         if cfg.has_option('feudalpolicy', 'actfreq_ds'):
             self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
-        self.use_pass = True
+        self.use_pass = False
         if cfg.has_option('feudalpolicy', 'use_pass'):
             self.use_pass = cfg.getboolean('feudalpolicy', 'use_pass')
@@ -320,7 +320,6 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         logger.info('start training...')
         a_batch_one_hot_new = None
-        #updating only states where the action is not "pass()" complicates things :/
         #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples
         if self.js_threshold < 1.0 or not self.use_pass:
@@ -363,7 +362,6 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         t_batch_new = t_batch
         if self.js_threshold < 1.0 or self.jsd_reward:
-            #TODO: This is highly inefficient
             js_divergence_batch = []
             for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
                 if slot != "None":
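The training hunks above gate extra work on js_threshold and collect a per-slot js_divergence_batch from consecutive belief states. As a point of reference, here is a self-contained sketch of the Jensen-Shannon divergence being batched there; it is illustrative and not the repository's actual helper:

# Illustrative only: JS divergence between two belief distributions over a slot's values.
import numpy as np

def js_divergence(p, q, eps=1e-12):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    kl = lambda a, b: float(np.sum(a * np.log(a / b)))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

# e.g. a slot's belief before and after a dialogue turn
print(js_divergence([0.7, 0.2, 0.1], [0.1, 0.6, 0.3]))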
policy/feudalgainRL/noisydqn.py  +0 −386

@@ -29,236 +29,6 @@ Author: Pei-Hao Su
 """
 import tensorflow as tf
-
-# ===========================
-#   Deep Q Network
-# ===========================
-class DeepQNetwork(object):
-    """
-    Input to the network is the state and action, output is Q(s,a).
-    """
-    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
-                 architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
-        self.sess = sess
-        self.s_dim = state_dim
-        self.a_dim = action_dim
-        self.learning_rate = learning_rate
-        self.tau = tau
-        self.architecture = architecture
-        self.h1_size = h1_size
-        self.h2_size = h2_size
-        self.minibatch_size = minibatch_size
-
-        # Create the deep Q network
-        self.inputs, self.action, self.Qout = \
-            self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
-        self.network_params = tf.trainable_variables()
-
-        # Target Network
-        self.target_inputs, self.target_action, self.target_Qout = \
-            self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
-        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
-
-        # Op for periodically updating target network
-        self.update_target_network_params = \
-            [self.target_network_params[i].assign(\
-                tf.multiply(self.network_params[i], self.tau) +
-                tf.multiply(self.target_network_params[i], 1. - self.tau))
-             for i in range(len(self.target_network_params))]
-
-        # Network target (y_i)
-        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
-
-        #self.temperature = tf.placeholder(shape=None,dtype=tf.float32)
-        # for Boltzman exploration
-        #self.softmax_Q = tf.nn.softmax(self.self.Qout/self.temperature)
-
-        # Predicted Q given state and chosed action
-        #actions_one_hot = tf.one_hot(self.action, self.a_dim, 1.0, 0.0, name='action_one_hot')
-        actions_one_hot = self.action
-        if architecture != 'dip':
-            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
-                                     [self.minibatch_size, 1])
-        else:
-            self.pred_q = self.Qout
-            #DIP case, not sure if will work
-            #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted_target')
-
-        #self.a_maxQ = tf.argmax(self.Qout, 1)
-        #action_maxQ_one_hot = tf.one_hot(self.a_maxQ, self.a_dim, 1.0, 0.0, name='action_maxQ_one_hot')
-        #self.action_maxQ_target = tf.reduce_sum(self.target_Qout * action_maxQ_one_hot, reduction_indices=1, name='a_maxQ_target')
-
-        # Define loss and optimization Op
-        self.diff = self.sampled_q - self.pred_q
-        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
-
-        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
-        self.optimize = self.optimizer.minimize(self.loss)
-
-        # gs = tf.gradients(self.loss, self.network_params)
-        # capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in zip(gs, self.network_params)]
-        #
-        # self.optimize = self.optimizer.apply_gradients(capped_gvs)
-
-    def create_ddq_network(self, architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
-        keep_prob = 1 - dropout_rate
-        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
-        action = tf.placeholder(tf.float32, [None, self.a_dim])
-
-        if architecture == 'duel':
-            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-            b_fc1 = tf.Variable(tf.zeros([h1_size]))
-            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
-
-            # value function
-            W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([h2_size]))
-            h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
-            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([1]))
-            value_out = tf.matmul(h_value, W_value) + b_value
-
-            # advantage function
-            W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_advantage = tf.Variable(tf.zeros([h2_size]))
-            h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
-            W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-            b_advantage = tf.Variable(tf.zeros([self.a_dim]))
-            Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage
-
-            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
-
-        elif architecture == 'noisy_duel':
-            print("WE USE DUEL NOISY ARCHITECTURE")
-            h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu)
-
-            # value function
-            h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
-            value_out = self.noisy_dense_layer(h_value, h2_size, 1)
-
-            # advantage function
-            h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
-            Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim)
-
-            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
-
-        elif architecture == 'dip':
-            # state network
-            W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-            b_fc1_s = tf.Variable(tf.zeros([h1_size]))
-            h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
-
-            # action network
-            W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
-            b_fc1_a = tf.Variable(tf.zeros([h1_size]))
-            h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
-
-            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
-
-            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
-
-            Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
-
-        else:
-            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-            b_fc1 = tf.Variable(tf.zeros([h1_size]))
-            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
-            if keep_prob < 1:
-                h_fc1 = tf.nn.dropout(h_fc1, keep_prob)
-
-            W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2 = tf.Variable(tf.zeros([h2_size]))
-            h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
-            if keep_prob < 1:
-                h_fc2 = tf.nn.dropout(h_fc2, keep_prob)
-
-            W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-            b_out = tf.Variable(tf.zeros([self.a_dim]))
-            Qout = tf.matmul(h_fc2, W_out) + b_out
-
-        return inputs, action, Qout
-
-    def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity):
-        W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
-        W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
-        W_eps = tf.random_normal(shape=[input_neurons, output_neurons])
-        W = W_mu + tf.multiply(W_sigma, W_eps)
-
-        b_mu = tf.Variable(tf.zeros([output_neurons]))
-        b_sigma = tf.Variable(tf.zeros([output_neurons]))
-        b_eps = tf.random_normal(shape=[output_neurons])
-        b = b_mu + tf.multiply(b_sigma, b_eps)
-
-        return activation(tf.matmul(input, W) + b)
-
-    def train(self, inputs, action, sampled_q):
-        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
-            #yes, needs to be changed too
-            self.inputs: inputs,
-            self.action: action,
-            self.sampled_q: sampled_q
-        })
-
-    def predict(self, inputs):
-        return self.sess.run(self.Qout, feed_dict={
-            self.inputs: inputs
-        })
-
-    def predict_dip(self, inputs, action):
-        return self.sess.run(self.Qout, feed_dict={
-            self.inputs: inputs,
-            self.action: action
-        })
-
-    def predict_action(self, inputs):
-        return self.sess.run(self.pred_q, feed_dict={
-            self.inputs: inputs
-        })
-
-    def predict_target(self, inputs):
-        return self.sess.run(self.target_Qout, feed_dict={
-            self.target_inputs: inputs
-        })
-
-    def predict_target_dip(self, inputs, action):
-        return self.sess.run(self.target_Qout, feed_dict={
-            self.target_inputs: inputs,
-            self.target_action: action
-        })
-
-    def predict_target_with_action_maxQ(self, inputs):
-        return self.sess.run(self.action_maxQ_target, feed_dict={
-            self.target_inputs: inputs,
-            self.inputs: inputs
-        })
-
-    def update_target_network(self):
-        self.sess.run(self.update_target_network_params)
-
-    #yes, but no need to change
-    def load_network(self, load_filename):
-        self.saver = tf.train.Saver()
-        if load_filename.split('.')[-3] != '0':
-            try:
-                self.saver.restore(self.sess, './' + load_filename)
-                print("Successfully loaded:", load_filename)
-            except:
-                print("Could not find old network weights")
-        else:
-            print('nothing loaded in first iteration')
-
-    def save_network(self, save_filename):
-        print('Saving deepq-network...')
-        self.saver.save(self.sess, './' + save_filename)
-
-    # yes but no need to change
-    def clipped_error(self, x):
-        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)  # condition, true, false
-
 class NNFDeepQNetwork(object):
     """
@@ -474,159 +244,3 @@ class NNFDeepQNetwork(object):
         self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma)))
         return activation(tf.matmul(input, W) + b)
-
-class RNNFDeepQNetwork(object):
-    """
-    Input to the network is the state and action, output is Q(s,a).
-    """
-    def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars,
-                 minibatch_size=64, architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40,
-                 si_enc_size=80, dropout_rate=0., slot='si'):
-        #super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars,
-        #                                      minibatch_size=64, architecture='duel', h1_size=130, h2_size=50)
-        self.sess = sess
-        self.si_dim = si_state_dim
-        self.sd_dim = sd_state_dim
-        self.a_dim = action_dim
-        self.learning_rate = learning_rate
-        self.tau = tau
-        self.architecture = architecture
-        self.h1_size = h1_size
-        self.h2_size = h2_size
-        self.minibatch_size = minibatch_size
-        self.sd_enc_size = sd_enc_size
-        self.si_enc_size = si_enc_size
-        self.dropout_rate = dropout_rate
-
-        # Create the deep Q network
-        self.inputs, self.action, self.Qout = \
-            self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size,
-                                       self.dropout_rate, slot=slot)
-        self.network_params = tf.trainable_variables()
-
-        # Target Network
-        self.target_inputs, self.target_action, self.target_Qout = \
-            self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size,
-                                       self.dropout_rate, tn='target', slot=slot)
-        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
-
-        # Op for periodically updating target network
-        self.update_target_network_params = \
-            [self.target_network_params[i].assign(
-                tf.multiply(self.network_params[i], self.tau) +
-                tf.multiply(self.target_network_params[i], 1. - self.tau))
-             for i in range(len(self.target_network_params))]
-
-        # Network target (y_i)
-        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
-
-        # Predicted Q given state and chosed action
-        actions_one_hot = self.action
-        if architecture != 'dip':
-            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
-                                     [self.minibatch_size, 1])
-        else:
-            self.pred_q = self.Qout
-
-        # Define loss and optimization Op
-        self.diff = self.sampled_q - self.pred_q
-        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
-
-        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
-        self.optimize = self.optimizer.minimize(self.loss)
-
-    #def create_slot_encoder(self):
-
-    def create_rnnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.,
-                              tn='normal', slot='si'):
-        inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim])
-        keep_prob = 1 - dropout_rate
-        sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1)
-        action = tf.placeholder(tf.float32, [None, self.a_dim])
-
-        if slot == 'sd':
-            sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim))
-
-            #slots encoder
-            with tf.variable_scope(tn):
-                #try:
-                lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size)
-                hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
-                _, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
-                #except:
-                #    lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size)
-                #    hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
-                #    _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
-        else:
-            W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01))
-            b_sdfe = tf.Variable(tf.zeros([sd_enc_size]))
-            h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe)
-            if keep_prob < 1:
-                h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)
-
-        W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01))
-        b_sife = tf.Variable(tf.zeros([si_enc_size]))
-        h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife)
-        if keep_prob < 1:
-            h_sife = tf.nn.dropout(h_sife, keep_prob)
-
-        W_fc1 = tf.Variable(tf.truncated_normal([sd_enc_size + si_enc_size, h1_size], stddev=0.01))
-        b_fc1 = tf.Variable(tf.zeros([h1_size]))
-        h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1)
-
-        W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-        b_fc2 = tf.Variable(tf.zeros([h2_size]))
-        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
-
-        W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-        b_out = tf.Variable(tf.zeros([self.a_dim]))
-        Qout = tf.matmul(h_fc2, W_out) + b_out
-
-        return inputs, action, Qout
-
-    def predict(self, inputs):
-        return self.sess.run(self.Qout, feed_dict={
-            #inputs where a single flat_bstate
-            self.inputs: inputs
-        })
-
-    def predict_dip(self, inputs, action):
-        return self.sess.run(self.Qout, feed_dict={
-            #inputs and action where array of 64 (batch size)
-            self.inputs: inputs,
-            self.action: action
-        })
-
-    def predict_target(self, inputs):
-        return self.sess.run(self.target_Qout, feed_dict={
-            #inputs where a single flat_bstate
-            self.target_inputs: inputs
-        })
-
-    def predict_target_dip(self, inputs, action):
-        return self.sess.run(self.target_Qout, feed_dict={
-            #inputs and action where array of 64 (batch size)
-            self.target_inputs: inputs,
-            self.target_action: action
-        })
-
-    def train(self, inputs, action, sampled_q):
-        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
-            #all the inputs are arrays of 64
-            self.inputs: inputs,
-            self.action: action,
-            self.sampled_q: sampled_q
-        })
-
-    def clipped_error(self, x):
-        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)  # condition, true, false
-
-    def save_network(self, save_filename):
-        print('Saving deepq-network...')
-        self.saver.save(self.sess, save_filename)
-
-    def update_target_network(self):
-        self.sess.run(self.update_target_network_params)
-
-    def load_network(self, load_filename):
-        self.saver = tf.train.Saver()
-        if load_filename.split('.')[-3] != '0':
-            try:
-                self.saver.restore(self.sess, load_filename)
-                print("Successfully loaded:", load_filename)
-            except:
-                print("Could not find old network weights")
-        else:
-            print('nothing loaded in first iteration')
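The deleted DeepQNetwork combined two ideas that the refactored policy keeps using: NoisyNet-style dense layers, where each weight is mu + sigma * eps with fresh Gaussian noise eps drawn per call, and the dueling aggregation Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). A minimal numpy sketch of one forward pass through such a head, with made-up shapes and initial values, purely for orientation and not a drop-in replacement for the removed TF1 graph:

# Forward pass only, in numpy, illustrating what the deleted TF1 graph computed.
import numpy as np

rng = np.random.default_rng(0)

def noisy_dense(x, w_mu, w_sigma, b_mu, b_sigma):
    # NoisyNet-style layer: parameters perturbed by fresh Gaussian noise on every call
    w = w_mu + w_sigma * rng.standard_normal(w_mu.shape)
    b = b_mu + b_sigma * rng.standard_normal(b_mu.shape)
    return x @ w + b

def init(n_in, n_out):
    # illustrative initialisation, loosely mirroring the deleted code's small stddev
    return (0.01 * rng.standard_normal((n_in, n_out)), 0.01 * np.ones((n_in, n_out)),
            np.zeros(n_out), 0.01 * np.ones(n_out))

s_dim, h, a_dim = 10, 8, 4
x = rng.standard_normal((1, s_dim))

h1 = np.maximum(noisy_dense(x, *init(s_dim, h)), 0.0)        # shared trunk, ReLU
value = noisy_dense(h1, *init(h, 1))                         # V(s), shape (1, 1)
advantage = noisy_dense(h1, *init(h, a_dim))                 # A(s, .), shape (1, a_dim)

# dueling aggregation used by both the 'duel' and 'noisy_duel' branches
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)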