From 14da5ec311891859489a63a04faa83081d073ac8 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Sun, 15 Mar 2020 22:33:05 +0300 Subject: [PATCH 1/5] LSTM scalar --- modules/dnn/src/layers/recurrent_layers.cpp | 10 ++ .../dnn/src/onnx/onnx_graph_simplifier.cpp | 25 +++ modules/dnn/src/onnx/onnx_importer.cpp | 165 ++++++++++++++++-- modules/dnn/src/tensorflow/tf_importer.cpp | 7 + modules/dnn/test/test_onnx_importer.cpp | 11 ++ 5 files changed, 204 insertions(+), 14 deletions(-) diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 3f9a229516..a3962db127 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -215,6 +215,8 @@ public: internals.push_back(shape(_numSamples, 1)); // dummyOnes internals.push_back(shape(_numSamples, 4*_numOut)); // gates + + std::cout << "LSTM out: " << outputs[0] << '\n'; return false; } @@ -301,6 +303,8 @@ public: tsEnd = numTimeStamps; tsInc = 1; } + std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n'; + std::cout << tsStart << " " << tsEnd << '\n'; for (int ts = tsStart; ts != tsEnd; ts += tsInc) { Range curRowRange(ts*numSamples, (ts + 1)*numSamples); @@ -314,6 +318,7 @@ public: Mat gateF = gates.colRange(1*numOut, 2*numOut); Mat gateO = gates.colRange(2*numOut, 3*numOut); Mat gateG = gates.colRange(3*numOut, 4*numOut); + std::cout << "i " << gateI << '\n'; if (forgetBias) add(gateF, forgetBias, gateF); @@ -329,6 +334,7 @@ public: { Mat gatesIFO = gates.colRange(0, 3*numOut); sigmoid(gatesIFO, gatesIFO); + std::cout << "ifo " << gatesIFO << '\n'; } tanh(gateG, gateG); @@ -345,12 +351,15 @@ public: } if (usePeephole) { + std::cout << "if (usePeephole)" << '\n'; gemm(cInternal, blobs[5], 1, gateO, 1, gateO); sigmoid(gateO, gateO); } //compute h_t tanh(cInternal, hInternal); + std::cout << "o " << gateO << '\n'; + std::cout << "tanh(o) " << hInternal << '\n'; multiply(gateO, hInternal, hInternal); //save results in output blobs @@ -358,6 +367,7 @@ public: if (produceCellOutput) cInternal.copyTo(cOutTs.rowRange(curRowRange)); } + std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n'; } }; diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index fe96927840..6693a75ff4 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -290,6 +290,30 @@ public: } }; +// // To remove Squeeze after LSTM for non-bidirectional LSTM +// class LSTMSqueeze : public Subgraph +// { +// public: +// LSTMSqueeze() +// { +// int input = addNodeToMatch(""); +// +// std::vector lstmInps(7); +// lstmInps[0] = input; +// +// for (int i = 1; i < 4; ++i) +// lstmInps[i] = addNodeToMatch("Unsqueeze"); +// lstmInps[4] = addNodeToMatch(""); +// for (int i = 5; i < 7; ++i) +// lstmInps[i] = addNodeToMatch("ConstantOfShape"); +// +// int lstm = addNodeToMatch("LSTM", lstmInps); +// addNodeToMatch("Squeeze", lstm); +// +// setFusedNode("LSTM", lstmInps); +// } +// }; + void simplifySubgraphs(opencv_onnx::GraphProto& net) { std::vector > subgraphs; @@ -299,6 +323,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net) subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); + // subgraphs.push_back(makePtr()); simplifySubgraphs(Ptr(new ONNXGraphWrapper(net)), subgraphs); } diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 7913fa729d..bcf3d28eed 100644 --- 
a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -322,7 +322,7 @@ void ONNXImporter::populateNet(Net dstNet) std::string layer_type = node_proto.op_type(); layerParams.type = layer_type; - + std::cout << layerParams.name << " " << layer_type << '\n'; if (layer_type == "MaxPool") { @@ -457,6 +457,19 @@ void ONNXImporter::populateNet(Net dstNet) constBlobs.insert(std::make_pair(layerParams.name, sliced[0])); continue; } + + layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size())); + layerParams.set("end", DictValue::arrayInt(&end[0], end.size())); + + CV_Assert(node_proto.input_size() == 1); + if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) + { + std::vector inputs(1, getBlob(node_proto, constBlobs, 0)), sliced; + runLayer(layerParams, inputs, sliced); + CV_Assert(sliced.size() == 1); + constBlobs.insert(std::make_pair(layerParams.name, sliced[0])); + continue; + } } else if (layer_type == "Split") { @@ -579,6 +592,117 @@ void ONNXImporter::populateNet(Net dstNet) constBlobs.insert(std::make_pair(layerParams.name, layerParams.blobs[0])); continue; } + else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape") + { + CV_Assert_N(node_proto.input_size()); + MatShape inpShape = getBlob(node_proto, constBlobs, 0); + float value = layerParams.get("value", 0); + Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value)); + constBlobs.insert(std::make_pair(layerParams.name, fill)); + continue; + } + else if (layer_type == "LSTM") + { + std::cout << "~~~~~~" << '\n'; + std::cout << layerParams << '\n'; + for (int i = 1; i < node_proto.input_size(); ++i) { + std::cout << "i: " << node_proto.input(i) << " " << constBlobs[node_proto.input(i)].size << '\n'; + } + + CV_Assert(node_proto.input_size() == 7); + Mat Wx = getBlob(node_proto, constBlobs, 1); + Mat Wh = getBlob(node_proto, constBlobs, 2); + Mat b = getBlob(node_proto, constBlobs, 3); + + + std::cout << Wx.size << '\n'; + std::cout << Wh.size << '\n'; + + int Wx_shape[] = {Wx.size[1], Wx.size[2]}; + int Wh_shape[] = {Wh.size[1], Wh.size[2]}; + std::cout << "b.size " << b.size << '\n'; + int b_shape[] = {2, b.size[1] / 2}; + + Wx = Wx.reshape(1, 2, &Wx_shape[0]); + b = b.reshape(1, 2, &b_shape[0]); + + std::cout << "b ----------------" << '\n'; + + std::cout << b << '\n'; + reduce(b, b, 0, REDUCE_SUM); + std::cout << b << '\n'; + + // https://pytorch.org/docs/stable/nn.html#lstm + // IFGO->IFOG + // swap each 3rd and 4th rows + // Wx = Wx.t(); + + float* weightData = (float*)Wx.data; + std::swap(weightData[1], weightData[2]); + + float* biasData = (float*)b.data; + std::swap(biasData[1], biasData[2]); + + // std::swap(weightData[2], weightData[3]); + // + // weightData = (float*)Wh.data; + // std::swap(weightData[1], weightData[2]); + // std::swap(weightData[2], weightData[3]); + + + // const int outSize = Wx.cols / 4; + // for (int i = 0; i < Wx.rows; ++i) + // for (int j = 0; j < outSize; ++j) + // { + // // std::swap(weightData[i * W.cols + 1 * outSize + j], + // // weightData[i * W.cols + 2 * outSize + j]); + // std::swap(weightData[i * Wx.cols + 2 * outSize + j], + // weightData[i * Wx.cols + 3 * outSize + j]); + // } + + // float* weightData = Wx.ptr(); + // for (int j = 0; j < 5; ++j) + // { + // std::cout << "swap " << (10 + j) << " " << (15 + j) << '\n'; + // for (int i = 0; i < 12; ++i) + // std::swap(weightData[(10 + j) * 12 + i], + // weightData[(15 + j) * 12 + i]); + // } + + layerParams.blobs.resize(3); + layerParams.blobs[0] = Wh.reshape(1, 2, 
&Wh_shape[0]); + layerParams.blobs[1] = Wx; + layerParams.blobs[2] = b; + + std::cout << "Wx" << '\n'; + std::cout << layerParams.blobs[1] << '\n'; + + std::cout << "Wh" << '\n'; + std::cout << layerParams.blobs[0] << '\n'; + + // layerParams.set("reverse", true); + + + // layerParams.set("use_peephole", true); + // layerParams.blobs.resize(6); + // for (int i = 0; i < 3; ++i) + // { + // Mat w = Mat::eye(layerParams.blobs[0].cols, layerParams.blobs[0].cols, CV_32F); + // layerParams.blobs[3 + i] = w; + // } + + // std::cout << layerParams.blobs[1] << '\n'; + + // int lstmId = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); + // + // layerParams = LayerParams(); + // + // // Add reshape + // int shape[] = {1, 10, 11, 5}; + // layerParams.name = node_proto.output(0) + "/reshape"; + // layerParams.type = "Reshape"; + // layerParams.set("dim", DictValue::arrayInt(&shape[0], 4)); + } else if (layer_type == "ImageScaler") { const float scale = layerParams.has("scale") ? layerParams.get("scale") : 1.0f; @@ -881,14 +1005,14 @@ void ONNXImporter::populateNet(Net dstNet) else if (layer_type == "Squeeze") { CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes")); - DictValue axes_dict = layerParams.get("axes"); - if (axes_dict.size() != 1) - CV_Error(Error::StsNotImplemented, "Multidimensional squeeze"); - - int axis = axes_dict.getIntValue(0); - layerParams.set("axis", axis - 1); - layerParams.set("end_axis", axis); - layerParams.type = "Flatten"; + // DictValue axes_dict = layerParams.get("axes"); + // if (axes_dict.size() != 1) + // CV_Error(Error::StsNotImplemented, "Multidimensional squeeze"); + // + // int axis = axes_dict.getIntValue(0); + // layerParams.set("axis", axis - 1); + // layerParams.set("end_axis", axis); + layerParams.type = "Identity"; } else if (layer_type == "Flatten") { @@ -1032,17 +1156,30 @@ void ONNXImporter::populateNet(Net dstNet) else if (layer_type == "Gather") { CV_Assert(node_proto.input_size() == 2); - CV_Assert(layerParams.has("axis")); Mat input = getBlob(node_proto, constBlobs, 0); Mat indexMat = getBlob(node_proto, constBlobs, 1); CV_Assert_N(indexMat.type() == CV_32S, indexMat.total() == 1); int index = indexMat.at(0); - int axis = layerParams.get("axis"); - std::vector ranges(input.dims, Range::all()); - ranges[axis] = Range(index, index + 1); + Mat out; + if (layerParams.has("axis")) + { + int axis = layerParams.get("axis"); - Mat out = input(ranges); + std::vector ranges(input.dims, Range::all()); + ranges[axis] = Range(index, index + 1); + + out = input(ranges); + } + else + { + CV_Assert(index < input.total()); + const int dims = input.dims; + input = input.reshape(1, 1); + input.dims = 2; + out = input.reshape(1, 1).colRange(index, index + 1); + out.dims = dims; + } constBlobs.insert(std::make_pair(layerParams.name, out)); continue; } diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index fe7e47f7a0..60ba6d39c5 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -1826,10 +1826,12 @@ void TFImporter::populateNet(Net dstNet) const int outSize = W.cols / 4; // IGFO->IFOG + std::cout << "(TF) W " << W.size << '\n'; float* weightData = (float*)W.data; for (int i = 0; i < W.rows; ++i) for (int j = 0; j < outSize; ++j) { + // std::cout << "swap " << i * W.cols + 1 * outSize << " " << i * W.cols + 2 * outSize << '\n'; std::swap(weightData[i * W.cols + 1 * outSize + j], weightData[i * W.cols + 2 * outSize + j]); 
std::swap(weightData[i * W.cols + 2 * outSize + j], @@ -1838,6 +1840,11 @@ void TFImporter::populateNet(Net dstNet) Wx = W.rowRange(0, W.rows - outSize).t(); Wh = W.rowRange(W.rows - outSize, W.rows).t(); + std::cout << "(TF) Wx " << Wx.size << '\n'; + std::cout << "(TF) Wh " << Wh.size << '\n'; + std::cout << "(TF) b " << b.size << '\n'; + + layerParams.blobs.resize(3); layerParams.blobs[0] = Wh; layerParams.blobs[1] = Wx; diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 40110d2542..c5b243b8ab 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -79,6 +79,12 @@ public: netSoftmax.setInput(ref); ref = netSoftmax.forward(); } + std::cout << "ref: " << ref.size << '\n'; + std::cout << "out: " << out.size << '\n'; + std::cout << ref.reshape(1, 1) << '\n'; + std::cout << '\n'; + std::cout << out.reshape(1, 1) << '\n'; + normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf); if (checkNoFallbacks) expectNoFallbacksFromIE(net); @@ -451,6 +457,11 @@ TEST_P(Test_ONNX_layers, Split_EltwiseMax) testONNXModels("split_max"); } +TEST_P(Test_ONNX_layers, LSTM) +{ + testONNXModels("lstm"); +} + INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets()); class Test_ONNX_nets : public Test_ONNX_layers From 8d69dbdf49f52c3610187753430de293dce823d0 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Sun, 15 Mar 2020 23:21:58 +0300 Subject: [PATCH 2/5] LSTM from ONNX works --- modules/dnn/src/layers/recurrent_layers.cpp | 10 - .../dnn/src/onnx/onnx_graph_simplifier.cpp | 25 --- modules/dnn/src/onnx/onnx_importer.cpp | 186 +++++++----------- modules/dnn/src/tensorflow/tf_importer.cpp | 7 - modules/dnn/test/test_onnx_importer.cpp | 6 - 5 files changed, 66 insertions(+), 168 deletions(-) diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index a3962db127..3f9a229516 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -215,8 +215,6 @@ public: internals.push_back(shape(_numSamples, 1)); // dummyOnes internals.push_back(shape(_numSamples, 4*_numOut)); // gates - - std::cout << "LSTM out: " << outputs[0] << '\n'; return false; } @@ -303,8 +301,6 @@ public: tsEnd = numTimeStamps; tsInc = 1; } - std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n'; - std::cout << tsStart << " " << tsEnd << '\n'; for (int ts = tsStart; ts != tsEnd; ts += tsInc) { Range curRowRange(ts*numSamples, (ts + 1)*numSamples); @@ -318,7 +314,6 @@ public: Mat gateF = gates.colRange(1*numOut, 2*numOut); Mat gateO = gates.colRange(2*numOut, 3*numOut); Mat gateG = gates.colRange(3*numOut, 4*numOut); - std::cout << "i " << gateI << '\n'; if (forgetBias) add(gateF, forgetBias, gateF); @@ -334,7 +329,6 @@ public: { Mat gatesIFO = gates.colRange(0, 3*numOut); sigmoid(gatesIFO, gatesIFO); - std::cout << "ifo " << gatesIFO << '\n'; } tanh(gateG, gateG); @@ -351,15 +345,12 @@ public: } if (usePeephole) { - std::cout << "if (usePeephole)" << '\n'; gemm(cInternal, blobs[5], 1, gateO, 1, gateO); sigmoid(gateO, gateO); } //compute h_t tanh(cInternal, hInternal); - std::cout << "o " << gateO << '\n'; - std::cout << "tanh(o) " << hInternal << '\n'; multiply(gateO, hInternal, hInternal); //save results in output blobs @@ -367,7 +358,6 @@ public: if (produceCellOutput) cInternal.copyTo(cOutTs.rowRange(curRowRange)); } - std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n'; } }; diff --git 
a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index 6693a75ff4..fe96927840 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -290,30 +290,6 @@ public: } }; -// // To remove Squeeze after LSTM for non-bidirectional LSTM -// class LSTMSqueeze : public Subgraph -// { -// public: -// LSTMSqueeze() -// { -// int input = addNodeToMatch(""); -// -// std::vector lstmInps(7); -// lstmInps[0] = input; -// -// for (int i = 1; i < 4; ++i) -// lstmInps[i] = addNodeToMatch("Unsqueeze"); -// lstmInps[4] = addNodeToMatch(""); -// for (int i = 5; i < 7; ++i) -// lstmInps[i] = addNodeToMatch("ConstantOfShape"); -// -// int lstm = addNodeToMatch("LSTM", lstmInps); -// addNodeToMatch("Squeeze", lstm); -// -// setFusedNode("LSTM", lstmInps); -// } -// }; - void simplifySubgraphs(opencv_onnx::GraphProto& net) { std::vector > subgraphs; @@ -323,7 +299,6 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net) subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); - // subgraphs.push_back(makePtr()); simplifySubgraphs(Ptr(new ONNXGraphWrapper(net)), subgraphs); } diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index bcf3d28eed..2bcba9e6ad 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -322,7 +322,7 @@ void ONNXImporter::populateNet(Net dstNet) std::string layer_type = node_proto.op_type(); layerParams.type = layer_type; - std::cout << layerParams.name << " " << layer_type << '\n'; + if (layer_type == "MaxPool") { @@ -457,19 +457,6 @@ void ONNXImporter::populateNet(Net dstNet) constBlobs.insert(std::make_pair(layerParams.name, sliced[0])); continue; } - - layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size())); - layerParams.set("end", DictValue::arrayInt(&end[0], end.size())); - - CV_Assert(node_proto.input_size() == 1); - if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) - { - std::vector inputs(1, getBlob(node_proto, constBlobs, 0)), sliced; - runLayer(layerParams, inputs, sliced); - CV_Assert(sliced.size() == 1); - constBlobs.insert(std::make_pair(layerParams.name, sliced[0])); - continue; - } } else if (layer_type == "Split") { @@ -592,116 +579,43 @@ void ONNXImporter::populateNet(Net dstNet) constBlobs.insert(std::make_pair(layerParams.name, layerParams.blobs[0])); continue; } - else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape") - { - CV_Assert_N(node_proto.input_size()); - MatShape inpShape = getBlob(node_proto, constBlobs, 0); - float value = layerParams.get("value", 0); - Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value)); - constBlobs.insert(std::make_pair(layerParams.name, fill)); - continue; - } else if (layer_type == "LSTM") { - std::cout << "~~~~~~" << '\n'; - std::cout << layerParams << '\n'; - for (int i = 1; i < node_proto.input_size(); ++i) { - std::cout << "i: " << node_proto.input(i) << " " << constBlobs[node_proto.input(i)].size << '\n'; - } - + // https://pytorch.org/docs/stable/nn.html#lstm CV_Assert(node_proto.input_size() == 7); Mat Wx = getBlob(node_proto, constBlobs, 1); Mat Wh = getBlob(node_proto, constBlobs, 2); Mat b = getBlob(node_proto, constBlobs, 3); + const int numHidden = Wh.size[2]; - std::cout << Wx.size << '\n'; - std::cout << Wh.size << '\n'; - - int Wx_shape[] = {Wx.size[1], Wx.size[2]}; - int Wh_shape[] = {Wh.size[1], Wh.size[2]}; - std::cout << "b.size " << 
b.size << '\n'; - int b_shape[] = {2, b.size[1] / 2}; - - Wx = Wx.reshape(1, 2, &Wx_shape[0]); - b = b.reshape(1, 2, &b_shape[0]); - - std::cout << "b ----------------" << '\n'; - - std::cout << b << '\n'; + Wx = Wx.reshape(1, Wx.size[1]); + Wh = Wh.reshape(1, Wh.size[1]); + b = b.reshape(1, 2); reduce(b, b, 0, REDUCE_SUM); - std::cout << b << '\n'; - - // https://pytorch.org/docs/stable/nn.html#lstm - // IFGO->IFOG - // swap each 3rd and 4th rows - // Wx = Wx.t(); - - float* weightData = (float*)Wx.data; - std::swap(weightData[1], weightData[2]); + // IFGO->IGFO + float* WxData = (float*)Wx.data; + float* WhData = (float*)Wh.data; float* biasData = (float*)b.data; - std::swap(biasData[1], biasData[2]); - - // std::swap(weightData[2], weightData[3]); - // - // weightData = (float*)Wh.data; - // std::swap(weightData[1], weightData[2]); - // std::swap(weightData[2], weightData[3]); - - - // const int outSize = Wx.cols / 4; - // for (int i = 0; i < Wx.rows; ++i) - // for (int j = 0; j < outSize; ++j) - // { - // // std::swap(weightData[i * W.cols + 1 * outSize + j], - // // weightData[i * W.cols + 2 * outSize + j]); - // std::swap(weightData[i * Wx.cols + 2 * outSize + j], - // weightData[i * Wx.cols + 3 * outSize + j]); - // } - - // float* weightData = Wx.ptr(); - // for (int j = 0; j < 5; ++j) - // { - // std::cout << "swap " << (10 + j) << " " << (15 + j) << '\n'; - // for (int i = 0; i < 12; ++i) - // std::swap(weightData[(10 + j) * 12 + i], - // weightData[(15 + j) * 12 + i]); - // } - + for (int j = 0; j < numHidden; ++j) + { + for (int i = 0; i < Wx.cols; ++i) + { + std::swap(WxData[(numHidden + j) * Wx.cols + i], + WxData[(numHidden * 2 + j) * Wx.cols + i]); + } + for (int i = 0; i < Wh.cols; ++i) + { + std::swap(WhData[(numHidden + j) * Wh.cols + i], + WhData[(numHidden * 2 + j) * Wh.cols + i]); + } + std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]); + } layerParams.blobs.resize(3); - layerParams.blobs[0] = Wh.reshape(1, 2, &Wh_shape[0]); + layerParams.blobs[0] = Wh; layerParams.blobs[1] = Wx; layerParams.blobs[2] = b; - - std::cout << "Wx" << '\n'; - std::cout << layerParams.blobs[1] << '\n'; - - std::cout << "Wh" << '\n'; - std::cout << layerParams.blobs[0] << '\n'; - - // layerParams.set("reverse", true); - - - // layerParams.set("use_peephole", true); - // layerParams.blobs.resize(6); - // for (int i = 0; i < 3; ++i) - // { - // Mat w = Mat::eye(layerParams.blobs[0].cols, layerParams.blobs[0].cols, CV_32F); - // layerParams.blobs[3 + i] = w; - // } - - // std::cout << layerParams.blobs[1] << '\n'; - - // int lstmId = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); - // - // layerParams = LayerParams(); - // - // // Add reshape - // int shape[] = {1, 10, 11, 5}; - // layerParams.name = node_proto.output(0) + "/reshape"; - // layerParams.type = "Reshape"; - // layerParams.set("dim", DictValue::arrayInt(&shape[0], 4)); } else if (layer_type == "ImageScaler") { @@ -1005,14 +919,29 @@ void ONNXImporter::populateNet(Net dstNet) else if (layer_type == "Squeeze") { CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes")); - // DictValue axes_dict = layerParams.get("axes"); - // if (axes_dict.size() != 1) - // CV_Error(Error::StsNotImplemented, "Multidimensional squeeze"); - // - // int axis = axes_dict.getIntValue(0); - // layerParams.set("axis", axis - 1); - // layerParams.set("end_axis", axis); - layerParams.type = "Identity"; + DictValue axes_dict = layerParams.get("axes"); + MatShape inpShape = outShapes[node_proto.input(0)]; + + 
std::vector maskedAxes(inpShape.size(), false); + for (int i = 0; i < axes_dict.size(); ++i) + { + int axis = axes_dict.getIntValue(i); + CV_CheckLE(axis, static_cast(inpShape.size()), "Squeeze axis"); + maskedAxes[axis] = inpShape[axis] == 1; + } + MatShape outShape; + for (int i = 0; i < inpShape.size(); ++i) + { + if (!maskedAxes[i]) + outShape.push_back(inpShape[i]); + } + if (outShape.size() != inpShape.size()) + { + layerParams.type = "Reshape"; + layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size())); + } + else + layerParams.type = "Identity"; } else if (layer_type == "Flatten") { @@ -1142,9 +1071,26 @@ void ONNXImporter::populateNet(Net dstNet) else layerParams.type = "Identity"; } - else if (layer_type == "ConstantOfShape") + else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape") { - float fill_value = layerParams.blobs.empty() ? 0 : layerParams.blobs[0].at(0, 0); + CV_Assert_N(node_proto.input_size()); + MatShape inpShape = getBlob(node_proto, constBlobs, 0); + float value = layerParams.get("value", 0); + Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value)); + constBlobs.insert(std::make_pair(layerParams.name, fill)); + continue; + } + else if (layer_type == "ConstantOfShape" || layer_type == "ConstantFill") + { + float fill_value; + if (!layerParams.blobs.empty()) + { + CV_Assert(!layerParams.has("value")); + fill_value = layerParams.blobs[0].at(0, 0); + } + else + fill_value = layerParams.get("value", 0); + MatShape inpShape = getBlob(node_proto, constBlobs, 0); for (int i = 0; i < inpShape.size(); i++) CV_CheckGT(inpShape[i], 0, ""); diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 60ba6d39c5..fe7e47f7a0 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -1826,12 +1826,10 @@ void TFImporter::populateNet(Net dstNet) const int outSize = W.cols / 4; // IGFO->IFOG - std::cout << "(TF) W " << W.size << '\n'; float* weightData = (float*)W.data; for (int i = 0; i < W.rows; ++i) for (int j = 0; j < outSize; ++j) { - // std::cout << "swap " << i * W.cols + 1 * outSize << " " << i * W.cols + 2 * outSize << '\n'; std::swap(weightData[i * W.cols + 1 * outSize + j], weightData[i * W.cols + 2 * outSize + j]); std::swap(weightData[i * W.cols + 2 * outSize + j], @@ -1840,11 +1838,6 @@ void TFImporter::populateNet(Net dstNet) Wx = W.rowRange(0, W.rows - outSize).t(); Wh = W.rowRange(W.rows - outSize, W.rows).t(); - std::cout << "(TF) Wx " << Wx.size << '\n'; - std::cout << "(TF) Wh " << Wh.size << '\n'; - std::cout << "(TF) b " << b.size << '\n'; - - layerParams.blobs.resize(3); layerParams.blobs[0] = Wh; layerParams.blobs[1] = Wx; diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index c5b243b8ab..a2cd2c3a68 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -79,12 +79,6 @@ public: netSoftmax.setInput(ref); ref = netSoftmax.forward(); } - std::cout << "ref: " << ref.size << '\n'; - std::cout << "out: " << out.size << '\n'; - std::cout << ref.reshape(1, 1) << '\n'; - std::cout << '\n'; - std::cout << out.reshape(1, 1) << '\n'; - normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? 
lInf : default_lInf); if (checkNoFallbacks) expectNoFallbacksFromIE(net); From 11d565ca629d5b36993752941472a26244600e79 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 18 Mar 2020 00:00:24 +0300 Subject: [PATCH 3/5] Fix LSTM from ONNX with batch==1 --- modules/dnn/src/layers/recurrent_layers.cpp | 9 +- modules/dnn/src/onnx/onnx_importer.cpp | 97 ++++++++++++++------- 2 files changed, 69 insertions(+), 37 deletions(-) diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 3f9a229516..26d2ea9de5 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -110,10 +110,11 @@ public: const Mat& Wh = blobs[0]; const Mat& Wx = blobs[1]; const Mat& bias = blobs[2]; - CV_Assert(Wh.dims == 2 && Wx.dims == 2); - CV_Assert(Wh.rows == Wx.rows); - CV_Assert(Wh.rows == 4*Wh.cols); - CV_Assert(Wh.rows == (int)bias.total()); + CV_CheckEQ(Wh.dims, 2, ""); + CV_CheckEQ(Wx.dims, 2, ""); + CV_CheckEQ(Wh.rows, Wx.rows, ""); + CV_CheckEQ(Wh.rows, 4*Wh.cols, ""); + CV_CheckEQ(Wh.rows, (int)bias.total(), ""); CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type()); // Peephole weights. diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 2bcba9e6ad..b243a986e7 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -49,6 +49,11 @@ class ONNXImporter LayerParams getLayerParams(const opencv_onnx::NodeProto& node_proto); bool isCeilMode(const LayerParams& layerParams); + void addLayer(Net& dstNet, LayerParams& layerParams, + const opencv_onnx::NodeProto& node_proto, + std::map& layer_id, + std::map& outShapes); + public: ONNXImporter(const char *onnxFile) @@ -259,6 +264,42 @@ Mat ONNXImporter::getBlob(const opencv_onnx::NodeProto& node_proto, return constBlob->second; } +void ONNXImporter::addLayer(Net& dstNet, LayerParams& layerParams, + const opencv_onnx::NodeProto& node_proto, + std::map& layer_id, + std::map& outShapes) +{ + std::map::iterator layerId; + std::map::iterator shapeIt; + + int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); + for (int i = 0; i < node_proto.output_size(); ++i) + { + layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i))); + } + + std::vector layerInpShapes, layerOutShapes, layerInternalShapes; + int inpNum = 0; + for (int j = 0; j < node_proto.input_size(); j++) { + layerId = layer_id.find(node_proto.input(j)); + if (layerId != layer_id.end()) { + dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum); + ++inpNum; + // Collect input shapes. + shapeIt = outShapes.find(node_proto.input(j)); + CV_Assert(shapeIt != outShapes.end()); + layerInpShapes.push_back(shapeIt->second); + } + } + // Compute shape of output blob for this layer. 
+ Ptr layer = dstNet.getLayer(id); + layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes); + for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i) + { + outShapes[node_proto.output(i)] = layerOutShapes[i]; + } +} + void ONNXImporter::populateNet(Net dstNet) { CV_Assert(model_proto.has_graph()); @@ -581,13 +622,16 @@ void ONNXImporter::populateNet(Net dstNet) } else if (layer_type == "LSTM") { + LayerParams lstmParams = layerParams; + lstmParams.name += "/lstm"; + // https://pytorch.org/docs/stable/nn.html#lstm CV_Assert(node_proto.input_size() == 7); Mat Wx = getBlob(node_proto, constBlobs, 1); Mat Wh = getBlob(node_proto, constBlobs, 2); Mat b = getBlob(node_proto, constBlobs, 3); - const int numHidden = Wh.size[2]; + const int numHidden = lstmParams.get("hidden_size"); Wx = Wx.reshape(1, Wx.size[1]); Wh = Wh.reshape(1, Wh.size[1]); @@ -612,10 +656,24 @@ void ONNXImporter::populateNet(Net dstNet) } std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]); } - layerParams.blobs.resize(3); - layerParams.blobs[0] = Wh; - layerParams.blobs[1] = Wx; - layerParams.blobs[2] = b; + + lstmParams.blobs.resize(3); + lstmParams.blobs[0] = Wh; + lstmParams.blobs[1] = Wx; + lstmParams.blobs[2] = b; + + node_proto.set_output(0, lstmParams.name); // set different name so output shapes will be registered on that name + addLayer(dstNet, lstmParams, node_proto, layer_id, outShapes); + + MatShape lstmShape = outShapes[node_proto.output(0)]; + + // Add fake 1 as it is done in ONNX + lstmShape.insert(lstmShape.begin() + 1, 1); + + layerParams.type = "Reshape"; + layerParams.set("dim", DictValue::arrayInt(&lstmShape[0], lstmShape.size())); + node_proto.set_input(0, lstmParams.name); // redirect input to LSTM + node_proto.set_output(0, layerParams.name); // keep origin LSTM's name } else if (layer_type == "ImageScaler") { @@ -1228,34 +1286,7 @@ void ONNXImporter::populateNet(Net dstNet) layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j)); } } - - int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); - for (int i = 0; i < node_proto.output_size(); ++i) - { - layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i))); - } - - std::vector layerInpShapes, layerOutShapes, layerInternalShapes; - int inpNum = 0; - for (int j = 0; j < node_proto.input_size(); j++) { - layerId = layer_id.find(node_proto.input(j)); - if (layerId != layer_id.end()) { - dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum); - ++inpNum; - // Collect input shapes. - shapeIt = outShapes.find(node_proto.input(j)); - CV_Assert(shapeIt != outShapes.end()); - layerInpShapes.push_back(shapeIt->second); - } - } - - // Compute shape of output blob for this layer. 
- Ptr layer = dstNet.getLayer(id); - layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes); - for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i) - { - outShapes[node_proto.output(i)] = layerOutShapes[i]; - } + addLayer(dstNet, layerParams, node_proto, layer_id, outShapes); } } From 8433620295891c184ce4edd86bbd5ad6440eda45 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Sun, 22 Mar 2020 00:20:36 +0300 Subject: [PATCH 4/5] Bidirectional LSTM --- modules/dnn/src/layers/recurrent_layers.cpp | 162 +++++++++++--------- modules/dnn/src/onnx/onnx_importer.cpp | 43 +++--- modules/dnn/test/test_onnx_importer.cpp | 5 + 3 files changed, 116 insertions(+), 94 deletions(-) diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 26d2ea9de5..69606a6b4e 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -93,6 +93,7 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer float forgetBias, cellClip; bool useCellClip, usePeephole; bool reverse; // If true, go in negative direction along the time axis + bool bidirectional; // If true, produces both forward and reversed directions along time axis public: @@ -101,6 +102,7 @@ public: { setParamsFrom(params); + bidirectional = params.get("bidirectional", false); if (!blobs.empty()) { CV_Assert(blobs.size() >= 3); @@ -113,7 +115,7 @@ public: CV_CheckEQ(Wh.dims, 2, ""); CV_CheckEQ(Wx.dims, 2, ""); CV_CheckEQ(Wh.rows, Wx.rows, ""); - CV_CheckEQ(Wh.rows, 4*Wh.cols, ""); + CV_CheckEQ(Wh.rows, (1 + static_cast(bidirectional))*4*Wh.cols, ""); CV_CheckEQ(Wh.rows, (int)bias.total(), ""); CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type()); @@ -136,6 +138,7 @@ public: useCellClip = params.get("use_cell_clip", false); usePeephole = params.get("use_peephole", false); reverse = params.get("reverse", false); + CV_Assert(!reverse || !bidirectional); allocated = false; outTailShape.clear(); @@ -207,6 +210,7 @@ public: outResShape.push_back(_numSamples); outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end()); + outResShape.back() *= (1 + static_cast(bidirectional)); size_t noutputs = produceCellOutput ? 2 : 1; outputs.assign(noutputs, outResShape); @@ -253,6 +257,7 @@ public: outTsShape.clear(); outTsShape.push_back(numSamples); outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end()); + outTsShape.back() *= (1 + static_cast(bidirectional)); allocated = true; } @@ -273,91 +278,96 @@ public: outputs_arr.getMatVector(output); internals_arr.getMatVector(internals); - const Mat &Wh = blobs[0]; - const Mat &Wx = blobs[1]; - const Mat &bias = blobs[2]; - - int numOut = Wh.size[1]; - - Mat hInternal = internals[0], cInternal = internals[1], - dummyOnes = internals[2], gates = internals[3]; - hInternal.setTo(0.); - cInternal.setTo(0.); - dummyOnes.setTo(1.); - - int numSamplesTotal = numTimeStamps*numSamples; - Mat xTs = input[0].reshape(1, numSamplesTotal); - - Mat hOutTs = output[0].reshape(1, numSamplesTotal); - Mat cOutTs = produceCellOutput ? 
output[1].reshape(1, numSamplesTotal) : Mat(); - - int tsStart, tsEnd, tsInc; - if (reverse) { - tsStart = numTimeStamps - 1; - tsEnd = -1; - tsInc = -1; - } - else { - tsStart = 0; - tsEnd = numTimeStamps; - tsInc = 1; - } - for (int ts = tsStart; ts != tsEnd; ts += tsInc) + const int numDirs = 1 + static_cast(bidirectional); + for (int i = 0; i < numDirs; ++i) { - Range curRowRange(ts*numSamples, (ts + 1)*numSamples); - Mat xCurr = xTs.rowRange(curRowRange); + const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs); + const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs); + const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs); - gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t - gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1} - gemm(dummyOnes, bias, 1, gates, 1, gates); //+b + int numOut = Wh.size[1]; - Mat gateI = gates.colRange(0*numOut, 1*numOut); - Mat gateF = gates.colRange(1*numOut, 2*numOut); - Mat gateO = gates.colRange(2*numOut, 3*numOut); - Mat gateG = gates.colRange(3*numOut, 4*numOut); + Mat hInternal = internals[0], cInternal = internals[1], + dummyOnes = internals[2], gates = internals[3]; + hInternal.setTo(0.); + cInternal.setTo(0.); + dummyOnes.setTo(1.); - if (forgetBias) - add(gateF, forgetBias, gateF); + int numSamplesTotal = numTimeStamps*numSamples; + Mat xTs = input[0].reshape(1, numSamplesTotal); - if (usePeephole) - { - Mat gatesIF = gates.colRange(0, 2*numOut); - gemm(cInternal, blobs[3], 1, gateI, 1, gateI); - gemm(cInternal, blobs[4], 1, gateF, 1, gateF); - sigmoid(gatesIF, gatesIF); + Mat hOutTs = output[0].reshape(1, numSamplesTotal); + hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs); + Mat cOutTs = produceCellOutput ? 
output[1].reshape(1, numSamplesTotal) : Mat(); + + int tsStart, tsEnd, tsInc; + if (reverse || i == 1) { + tsStart = numTimeStamps - 1; + tsEnd = -1; + tsInc = -1; } - else - { - Mat gatesIFO = gates.colRange(0, 3*numOut); - sigmoid(gatesIFO, gatesIFO); + else { + tsStart = 0; + tsEnd = numTimeStamps; + tsInc = 1; } - - tanh(gateG, gateG); - - //compute c_t - multiply(gateF, cInternal, gateF); // f_t (*) c_{t-1} - multiply(gateI, gateG, gateI); // i_t (*) g_t - add(gateF, gateI, cInternal); // c_t = f_t (*) c_{t-1} + i_t (*) g_t - - if (useCellClip) + for (int ts = tsStart; ts != tsEnd; ts += tsInc) { - min(cInternal, cellClip, cInternal); - max(cInternal, -cellClip, cInternal); - } - if (usePeephole) - { - gemm(cInternal, blobs[5], 1, gateO, 1, gateO); - sigmoid(gateO, gateO); - } + Range curRowRange(ts*numSamples, (ts + 1)*numSamples); + Mat xCurr = xTs.rowRange(curRowRange); - //compute h_t - tanh(cInternal, hInternal); - multiply(gateO, hInternal, hInternal); + gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t + gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1} + gemm(dummyOnes, bias, 1, gates, 1, gates); //+b - //save results in output blobs - hInternal.copyTo(hOutTs.rowRange(curRowRange)); - if (produceCellOutput) - cInternal.copyTo(cOutTs.rowRange(curRowRange)); + Mat gateI = gates.colRange(0*numOut, 1*numOut); + Mat gateF = gates.colRange(1*numOut, 2*numOut); + Mat gateO = gates.colRange(2*numOut, 3*numOut); + Mat gateG = gates.colRange(3*numOut, 4*numOut); + + if (forgetBias) + add(gateF, forgetBias, gateF); + + if (usePeephole) + { + Mat gatesIF = gates.colRange(0, 2*numOut); + gemm(cInternal, blobs[3], 1, gateI, 1, gateI); + gemm(cInternal, blobs[4], 1, gateF, 1, gateF); + sigmoid(gatesIF, gatesIF); + } + else + { + Mat gatesIFO = gates.colRange(0, 3*numOut); + sigmoid(gatesIFO, gatesIFO); + } + + tanh(gateG, gateG); + + //compute c_t + multiply(gateF, cInternal, gateF); // f_t (*) c_{t-1} + multiply(gateI, gateG, gateI); // i_t (*) g_t + add(gateF, gateI, cInternal); // c_t = f_t (*) c_{t-1} + i_t (*) g_t + + if (useCellClip) + { + min(cInternal, cellClip, cInternal); + max(cInternal, -cellClip, cInternal); + } + if (usePeephole) + { + gemm(cInternal, blobs[5], 1, gateO, 1, gateO); + sigmoid(gateO, gateO); + } + + //compute h_t + tanh(cInternal, hInternal); + multiply(gateO, hInternal, hInternal); + + //save results in output blobs + hInternal.copyTo(hOutTs.rowRange(curRowRange)); + if (produceCellOutput) + cInternal.copyTo(cOutTs.rowRange(curRowRange)); + } } } }; diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index b243a986e7..79386e6615 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -630,37 +630,44 @@ void ONNXImporter::populateNet(Net dstNet) Mat Wx = getBlob(node_proto, constBlobs, 1); Mat Wh = getBlob(node_proto, constBlobs, 2); Mat b = getBlob(node_proto, constBlobs, 3); + b = b.reshape(1, b.size[0]); const int numHidden = lstmParams.get("hidden_size"); - - Wx = Wx.reshape(1, Wx.size[1]); - Wh = Wh.reshape(1, Wh.size[1]); - b = b.reshape(1, 2); - reduce(b, b, 0, REDUCE_SUM); + const int numDirs = Wx.size[0]; // Is 1 for forward only and 2 for bidirectional LSTM. 
+ const int numFeatures = Wx.size[2]; + Mat bx = b.colRange(0, b.cols / 2); + Mat bh = b.colRange(b.cols / 2, b.cols); + b = bx + bh; // IFGO->IGFO - float* WxData = (float*)Wx.data; - float* WhData = (float*)Wh.data; - float* biasData = (float*)b.data; - for (int j = 0; j < numHidden; ++j) + for (int k = 0; k < numDirs; ++k) { - for (int i = 0; i < Wx.cols; ++i) + float* WxData = Wx.ptr(k); + float* WhData = Wh.ptr(k); + float* biasData = b.ptr(k); + for (int j = 0; j < numHidden; ++j) { - std::swap(WxData[(numHidden + j) * Wx.cols + i], - WxData[(numHidden * 2 + j) * Wx.cols + i]); + for (int i = 0; i < numFeatures; ++i) + { + std::swap(WxData[(numHidden + j) * numFeatures + i], + WxData[(numHidden * 2 + j) * numFeatures + i]); + } + for (int i = 0; i < numHidden; ++i) + { + std::swap(WhData[(numHidden + j) * numHidden + i], + WhData[(numHidden * 2 + j) * numHidden + i]); + } + std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]); } - for (int i = 0; i < Wh.cols; ++i) - { - std::swap(WhData[(numHidden + j) * Wh.cols + i], - WhData[(numHidden * 2 + j) * Wh.cols + i]); - } - std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]); } + Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]); + Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]); lstmParams.blobs.resize(3); lstmParams.blobs[0] = Wh; lstmParams.blobs[1] = Wx; lstmParams.blobs[2] = b; + lstmParams.set("bidirectional", lstmParams.get("direction", "") == "bidirectional"); node_proto.set_output(0, lstmParams.name); // set different name so output shapes will be registered on that name addLayer(dstNet, lstmParams, node_proto, layer_id, outShapes); diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index a2cd2c3a68..f741319959 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -456,6 +456,11 @@ TEST_P(Test_ONNX_layers, LSTM) testONNXModels("lstm"); } +TEST_P(Test_ONNX_layers, LSTM_bidirectional) +{ + testONNXModels("lstm_bidirectional"); +} + INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets()); class Test_ONNX_nets : public Test_ONNX_layers From 467c3ef0ac621b2cbc296bbabe286bc9cc476696 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Sun, 22 Mar 2020 16:04:30 +0300 Subject: [PATCH 5/5] Add checks for LSTM initial h and c --- modules/dnn/src/onnx/onnx_importer.cpp | 22 +++++++++++++--------- modules/dnn/test/test_onnx_importer.cpp | 6 ++++-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 79386e6615..47b5aff674 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -496,6 +496,7 @@ void ONNXImporter::populateNet(Net dstNet) runLayer(layerParams, inputs, sliced); CV_Assert(sliced.size() == 1); constBlobs.insert(std::make_pair(layerParams.name, sliced[0])); + outShapes[layerParams.name] = shape(sliced[0]); continue; } } @@ -630,6 +631,8 @@ void ONNXImporter::populateNet(Net dstNet) Mat Wx = getBlob(node_proto, constBlobs, 1); Mat Wh = getBlob(node_proto, constBlobs, 2); Mat b = getBlob(node_proto, constBlobs, 3); + CV_CheckEQ(countNonZero(getBlob(node_proto, constBlobs, 5)), 0, "Unsupported non zero initial_h"); + CV_CheckEQ(countNonZero(getBlob(node_proto, constBlobs, 6)), 0, "Unsupported non zero initial_c"); b = b.reshape(1, b.size[0]); const int numHidden = lstmParams.get("hidden_size"); @@ -1007,6 +1010,16 @@ void ONNXImporter::populateNet(Net dstNet) } else 
layerParams.type = "Identity"; + + if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) + { + Mat inp = getBlob(node_proto, constBlobs, 0); + Mat out = inp.reshape(1, outShape); + out.dims = outShape.size(); // to workaround dims == 1 + constBlobs.insert(std::make_pair(layerParams.name, out)); + outShapes[layerParams.name] = shape(out); + continue; + } } else if (layer_type == "Flatten") { @@ -1136,15 +1149,6 @@ void ONNXImporter::populateNet(Net dstNet) else layerParams.type = "Identity"; } - else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape") - { - CV_Assert_N(node_proto.input_size()); - MatShape inpShape = getBlob(node_proto, constBlobs, 0); - float value = layerParams.get("value", 0); - Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value)); - constBlobs.insert(std::make_pair(layerParams.name, fill)); - continue; - } else if (layer_type == "ConstantOfShape" || layer_type == "ConstantFill") { float fill_value; diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index f741319959..6932e83a4e 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -405,6 +405,8 @@ TEST_P(Test_ONNX_layers, Reshape) TEST_P(Test_ONNX_layers, Squeeze) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); testONNXModels("squeeze"); } @@ -453,12 +455,12 @@ TEST_P(Test_ONNX_layers, Split_EltwiseMax) TEST_P(Test_ONNX_layers, LSTM) { - testONNXModels("lstm"); + testONNXModels("lstm", npy, 0, 0, false, false); } TEST_P(Test_ONNX_layers, LSTM_bidirectional) { - testONNXModels("lstm_bidirectional"); + testONNXModels("lstm_bidirectional", npy, 0, 0, false, false); } INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
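For context, below is a minimal usage sketch of what this patch series enables: importing a (possibly bidirectional) ONNX LSTM with OpenCV's DNN module and running a forward pass. It is illustrative only — the model filename "lstm.onnx" and the tensor sizes are placeholders, not artifacts from these patches; only readNetFromONNX, setInput and forward are the actual public API involved, and the comment about the output layout reflects the Reshape added in patch 3 rather than a guaranteed contract.

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>

int main()
{
    using namespace cv;
    using namespace cv::dnn;

    // Hypothetical single-layer LSTM exported to ONNX (e.g. from PyTorch).
    Net net = readNetFromONNX("lstm.onnx");

    // ONNX LSTM expects X of shape [seq_length, batch_size, input_size].
    const int seqLen = 5, batch = 1, inputSize = 10;
    int inpShape[] = {seqLen, batch, inputSize};
    Mat input(3, inpShape, CV_32F, Scalar(0.1f));  // constant dummy data

    net.setInput(input);
    Mat out = net.forward();

    // After this series the Y output is reshaped to roughly mirror ONNX:
    // [seq_length, num_directions, batch_size, hidden_size].
    for (int i = 0; i < out.dims; ++i)
        std::cout << out.size[i] << (i + 1 < out.dims ? " x " : "\n");
    return 0;
}

This mirrors how the new Test_ONNX_layers::LSTM and LSTM_bidirectional cases exercise the importer, except that the tests feed reference .npy blobs instead of the dummy input used here.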