4

After upgrading from boost 1.67.0 to boost 1.75.0, we're having problems with boost beast's HTTP request library.

I have the following code which communicates with the Kubernetes API server to store a custom resource via the REST interface using boost beast:

#include <string>
#include <iostream>
#include <sstream>
#include <fstream> 
#include <boost/beast/core.hpp>
#include <boost/beast/version.hpp>
#include <boost/beast/http.hpp>
#include <boost/asio/connect.hpp>
#include <boost/asio/ip/tcp.hpp>
#include <boost/asio/ssl/stream.hpp> 
#include <boost/asio/ssl/error.hpp>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/json_parser.hpp>


namespace bip = boost::asio::ip;
namespace bhttp = boost::beast::http;
namespace ssl = boost::asio::ssl;

// Registers a service endpoint with the Kubernetes API server by POSTing an
// "SdsEndpoint" custom resource (group/version sdsendpoints.net/v1) over HTTPS.
//
// Parameters:
//   topicName          - stored as the resource's metadata.name
//   url                - stored as the resource's spec.endpointURL
//   host, port         - address of the API server to resolve and connect to
//   discoveryNamespace - namespace the resource is created in; a trailing '/'
//                        is tolerated
//
// Resolve/connect/handshake/read/write failures are reported by the throwing
// overloads of the Boost calls used below; HTTP statuses >= 400 and shutdown
// errors are only logged to std::cout.
//
// NOTE(review): no verify mode or CA certificate is configured on the SSL
// context in this function, so the server certificate is presumably not being
// validated - confirm whether that is intended.
void postServiceEndpoint(std::string topicName, std::string url, 
                         std::string host, std::string port, std::string discoveryNamespace)
{
  boost::asio::io_context context;
  boost::asio::ip::tcp::resolver resolver(context);
  ssl::context sslCtx(ssl::context::sslv23_client);
  boost::asio::ssl::stream<boost::beast::tcp_stream> sslStream(context, sslCtx);

  auto const results = resolver.resolve(host, port);
  //Send the host name via SNI so the server can select the right certificate.
  if(SSL_set_tlsext_host_name(sslStream.native_handle(), host.c_str()) != 1)
  {
    std::cout << "Failed to set SNI host name for: " << host << std::endl;
  }
  boost::beast::get_lowest_layer(sslStream).connect(results);
  sslStream.handshake(ssl::stream_base::client);

  //Load the bearer token for authenticating with K8s...
  std::ifstream tokenFile("/var/run/secrets/kubernetes.io/serviceaccount/token");
  if(!tokenFile)
  {
    std::cout << "Could not open the service account token file; sending an empty bearer token." << std::endl;
  }
  std::string bearerToken((std::istreambuf_iterator<char>(tokenFile)),
                          std::istreambuf_iterator<char>());
  //A trailing newline (or other whitespace) must not end up inside the
  //Authorization header value, so strip it.
  const std::string::size_type lastTokenChar = bearerToken.find_last_not_of(" \t\r\n");
  bearerToken.erase(lastTokenChar == std::string::npos ? 0 : lastTokenChar + 1);

  //Collection path: /apis/<group>/<version>/namespaces/<namespace>/<plural>.
  //The separator between the namespace and the plural is added only when the
  //caller did not already supply it.
  std::string target = "/apis/sdsendpoints.net/v1/namespaces/" + discoveryNamespace;
  if(target.back() != '/')
  {
    target += '/';
  }
  target += "sdsendpoints";
  //Because the endpoint hasn't been created yet, we can't use it in the target
  //string, but if we want to retrieve the endpoint later, we have to use its name
  //in the target string.
  bhttp::request<bhttp::string_body> request(bhttp::verb::post, target, HTTPV1DOT1);
  std::cout << "DEBUG: Set host field. " << std::endl;
  request.set(bhttp::field::host, host);
  std::cout << "DEBUG: Set content type." << std::endl;
  request.set("Content-Type", "application/json");
  std::cout << "DEBUG: Set bearer token." << std::endl;
  request.set("Authorization", "Bearer " + bearerToken);

  //Build the JSON body describing the custom resource.
  boost::property_tree::ptree requestTree;
  requestTree.put("apiVersion", "sdsendpoints.net/v1");
  requestTree.put("kind", "SdsEndpoint");
  requestTree.put("metadata.name", topicName);
  requestTree.put("spec.endpointURL", url);

  std::stringstream jsonStream;
  boost::property_tree::write_json(jsonStream, requestTree);
  request.body() = jsonStream.str();
  //Fills in Content-Length based on the body set above.
  request.prepare_payload();
  std::cout << "REQUEST: \n" << request << std::endl;
  bhttp::write(sslStream, request);

  boost::beast::flat_buffer buffer;
  bhttp::response<bhttp::string_body> response;
  bhttp::read(sslStream, buffer, response);
  if(response.result_int() >= 400)
  {
       std::cout << "Got failure on post endpoint: " << response.result_int() << ": " << response.result() << " : " << response.body() << std::endl;
  }

  //Cleanup the SSL socket...
  boost::system::error_code ec;
  sslStream.shutdown(ec);
  if(ec == boost::asio::error::eof)
  {
    //EOF during SSL shutdown is treated as a clean close, so clear the error.
    ec.assign(0, ec.category());
  }
  if(ec)
  {
    std::cout << "Got error code: " << ec << " on socket cleanup in SSL shutdown" << std::endl;
  }
  sslStream.lowest_layer().shutdown(boost::asio::ip::tcp::socket::shutdown_both, ec);
  if(ec)
  {
    std::cout << "Got error code: " << ec << " on socket cleanup in TCP socket shutdown." << std::endl;
  }

}

This code worked absolutely fine in boost 1.67.0, but upon upgrading to boost 1.75.0, we immediately found that the code would hang when the boost::beast request has the set() method called.

However, intriguingly, this behavior only occurs when the code is run inside of a container as part of a Kubernetes app.

Looking at our debug printouts, we only get as far as the "DEBUG: Set host field." printout. When running the code outside of the container environment on the physical host machines, it runs fine and is able to successfully communicate across the network.

Originally, the container used the RHEL UBI minimal image 7.9. As part of our debugging efforts, we rebuilt it using the standard UBI image and found the same freezing behavior. Installing gdb into the container image and shelling into the running container to attach to the binary with the code reveals the following stack trace:

(gdb) bt
#0 0x0000000000ae0826 in name_string (this=<optimized out>) at /opt/common/boost/1.75.0/include/boost/beast/http/impl/fields.hpp:326
#1 operator() (this=<optimized out>, lhs=..., rhs=...) at /opt/common/boost/1.75.0/include/boost/beast/http/fields.hpp:140
#2 operator() (this=<optimized out>, key1=..., nonkey2=...) at /opt/common/boost/1.75.0/include/boost/intrusive/detail/tree_value_compare.hpp:175
#3 operator() (this=<optimized out>, t1=<optimized out>, t2=...) at /opt/common/boost/1.75.0/include/boost/intrusive/detail/key_nodeptr_comp.hpp:175
#4 lower_bound_loop (x=<optimized out>, y=<optimized out>, key=..., comp=...) at /opt/common/boost/include/intrusive/bstree_algorithms:2027
#5 lower_bound (header=<optimized out>, key=..., comp=...) at /opt/common/boost/1.75.0/include/boost/intrusive/bstree_algorithms.hpp:918
#6 lower_bound (this=<optimized out>, key=..., comp=...) at /opt/common/boost/1.75.0/include/boost/intrusive/bstree.hpp:333
#7 boost::beast::http::basic_fields<std::allocator<char> >::set_element (this=0x7fffec920490, e=...) at /opt/common/boost/1.75.0/include/boost/beast/http/impl/fields.hpp:1005
#8 0x00000000016a8cbc in postServiceEndpoint(std::string, std::string) ()

As far as we can tell from the gdb output, the boost::beast code is just sitting in a dead loop for some reason. The strangest part is that the code appears to be very innocuous as it's just setting some fields inside the class. There's no network code or any kind of weird multithreading that could lead to a race condition involved.

The code was compiled with intel icc 19.1.0.166 20191121 for both boost 1.67.0 and boost 1.75.0. The host environment is RHEL 7.9 (Maipo) as well, so the container and host are pretty much the same OS. Our container engine is CRIO 1.18.4 and we're running kubernetes 1.21 (not that we think k8s has anything to do with the problem).

The only thing that has changed is the upgrading of boost versions, and if we revert the boost version the code runs fine again. However, because boost::json is only available starting in boost 1.75, we need to upgrade, so simply hanging back on the version isn't really an option for us.

Has something in the API changed that might make the request's set() method hang like this?

Edit (curiouser and curiouser): We broke the code down into an even more minimal example that only calls the boost beast set method, and found that it does run inside a container, so we went back to the full code (just the function) and found that, by itself, also appears to run in a container when used in a minimal unit test. The problem appears to come with the full application itself. The only difference is that the full app is multithreaded, but even so, this code is only being used in a single thread, so it makes no sense for there to be a deadlock hazard given that fact, especially since it worked fine in 1.67.0.

Edit #2: We were able to find a workaround that fixed the problem by rearranging some code, but it's unclear as to why this fixed the issue.

The gist: The full REST code is in a C++ class which implements an interface. I noticed that the code which ultimately used this class accepted a reference, rather than a shared_ptr, something like the following:

// Abstract interface for looking up and publishing service endpoints by topic.
// Implementations override both pure virtual methods; users hold the object
// through this base type.
class IServiceEndpointFinder 
{ 
public:
   // Virtual destructor so an implementation can be destroyed safely through
   // a pointer to this interface.
   virtual ~IServiceEndpointFinder() = default;

   // Returns the endpoint associated with `topic` (presumably its URL).
   virtual std::string getEndpoint(std::string topic) = 0; 

   // Publishes `url` as the endpoint for `topic`.
   virtual void postEndpoint(std::string topic, std::string url) = 0; 
}; 

public class KubernetesEndpointFinder : public IServiceEndpointFinder
{
//All the methods you'd expect from the previous code. 
}; 

public class ThingThatUsesEndpointFinder { 
    protected:
    ThingThatUsesEndpointFinder(IServiceEndpointFinder& discovery)
    {
       //Do some stuff
       discovery.postEndpoint(endpointTopic, endpointURL); 
    }

};

//... Some code
// The shared_ptr owns the finder; only a reference to the pointee is handed
// to the user, so `discovery` must outlive every use of that reference.
boost::shared_ptr<IServiceEndpointFinder> discovery(new KubernetesEndpointFinder(...));
ThingThatUsesEndpointFinder user(*discovery); 

To be honest, when I noticed this code, I was surprised to find it even compiled. Long story short, it appeared to be an oversight when we created the endpoint finder interface (previously it had been a single class that used a custom microservice rather than the K8s API).

When I changed the above to use a shared_ptr, rather than a reference, the code no longer froze. Considering the correct code appeared to be getting invoked whether it was a reference or a shared_ptr, I'm not sure why this would have an effect.

At the same time, we decided to set the host header through the string overload of the HTTP request's set method, i.e. set("Host", ...), rather than through the built-in bhttp::field::host enum value.

These are the only two changes we made to the code, and either of them alone or together seems unlikely to have fixed the issue. We've exercised the code pretty heavily since it stopped freezing up, so it seems stable. We're at a loss to explain the problem or why these relatively simple changes seemed to have fixed it.

stix
  • 1,140
  • 13
  • 36
  • "only occurs when the code is run inside of a container as part of a Kubernetes app.", but you also say that it's code that specifically interacts with the Kubernetes API, so I'm a bit curious as to what the "working" setup entails. Is it running via `docker run`? Is it contacting a mock API? –  May 27 '21 at 21:35
  • @Frank When we run it outside of the container, it still contacts the same K8s api. Only the port and host it dials up are changed. We've set up a mock bearer token on the host machine in the appropriate location, but it's an identical token to what k8s provides when run in a pod. There's no docker or containerization involved in the working setup. The k8s api server exposes itself externally at port 6443 vice 443. However, all of that is moot as the code hangs before any real network interaction happens (other than the initial SSL connection), and hangs in non-network code. – stix May 27 '21 at 21:39
  • Have you tried to narrow down in which version of boost the issue starts to appear? Could you do that? Maybe you will get a clue about what has changed; this is the list of release changes: https://www.boost.org/doc/libs/1_76_0/libs/beast/doc/html/beast/release_notes.html – Alessandro Teruzzi Jun 04 '21 at 11:12
  • The file in your stack trace seems to first appear in version 1.70: https://github.com/boostorg/beast/blob/boost-1.70.0/include/boost/beast/http/impl/fields.hpp – Alessandro Teruzzi Jun 04 '21 at 11:37
  • @AlessandroTeruzzi I built the 1.75.0 from source with two different methods (using our internal Conan process and by hand). We don't even have a copy of 1.70.0 source. It's definitely 1.75.0. – stix Jun 04 '21 at 17:27
  • I am sure it is 1.75; I was just pointing out that the file in your stack trace doesn't exist in 1.67. I can see some infinite loops in it (only breaking when some condition is met). This is obviously only speculation, but are you sure the other threads are not modifying the same data? – Alessandro Teruzzi Jun 04 '21 at 17:41
  • @AlessandroTeruzzi Yes we're positive on that. The other threads go into a zeromq poll loop that just sits waiting for data on its sockets. This code is relatively early on in the app and most things are still initializing at that point. We've been able to get a workaround (question updated), but we're still at a loss to explain the behavior in the first place as the changes were minor at best. – stix Jun 04 '21 at 17:45
  • 2
    It is not uncommon to use a reference (usually a const reference), but in that case the lifetime of the object must be guaranteed by the caller. Very likely you had undefined behaviour from using a reference to an object that had gone out of scope. Using the shared_ptr prevents the object from being destroyed. – Alessandro Teruzzi Jun 04 '21 at 22:06
  • I'm with @AlessandroTeruzzi's analysis. In short, the problem was indeed UB outside the code (originally) shown. – sehe Jun 07 '21 at 12:25

1 Answer

3

We broke the code down into an even more minimal example that only calls the boost beast set method, and found that it does run inside a container, so we went back to the full code (just the function) and found that, by itself, also appears to run in a container when used in a minimal unit test. The problem appears to come with the full application itself.

This usually spells Undefined Behaviour. If some part of the application corrupts memory (stack or heap) this can have completely unrelated and unpredictable consequences anywhere.

The only difference is that the full app is multithreaded, but even so, this code is only being used in a single thread, so it makes no sense for there to be a deadlock hazard given that fact, especially since it worked fine in 1.67.0.

That's the nature of UB: it doesn't need to make sense. It may not even be a deadlock (soft locks happen when state is corrupted).

I strongly suggest you enable ASAN/UBSAN to scout for errors. If you can, use e.g. valgrind for alternative angles (not all UB is detected by all sanitizers, because of the halting problem).


FWIW I just tried your code on a CentOS7 container:

[root@dbfd1cc8688d stackoverflow]# cat /etc/centos-release
CentOS Linux release 7.9.2009 (Core)

With GCC:

[root@dbfd1cc8688d stackoverflow]# g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-44) (GCC) 

And it gave no unexpected symptoms:

[root@dbfd1cc8688d stackoverflow]# g++ -pthread -std=c++11 test.cpp -isystem /opt/boost/include/ -l{crypto,ssl}
[root@dbfd1cc8688d stackoverflow]# ./a.out 

DEBUG: Set host field. 
DEBUG: Set content type.
DEBUG: Set bearer token.
REQUEST: 
POST /apis/sdsendpoints.net/v1/namespaces/namespacesdsendpoints HTTP/1.1
Host: 192.168.50.225
Content-Type: application/json
Authorization: Bearer
Content-Length: 175

{
    "apiVersion": "sdsendpoints.net\/v1",
    "kind": "SdsEndpoint",
    "metadata": {
        "name": "topicName"
    },
    "spec": {
        "endpointURL": "url"
    }
}

Got failure on post endpoint: 404: Not Found : <html>
<head><title>404 Not Found</title></head>
<body bgcolor="white">
<center><h1>404 Not Found</h1></center>
<hr><center>nginx/1.14.0 (Ubuntu)</center>
</body>
</html>

Error in SSL shutdown: short read
[root@dbfd1cc8688d stackoverflow]# 
sehe
  • 374,641
  • 47
  • 450
  • 633