diff options
Diffstat (limited to 'tvix/tracing/src/lib.rs')
-rw-r--r-- | tvix/tracing/src/lib.rs | 232 |
1 files changed, 188 insertions, 44 deletions
diff --git a/tvix/tracing/src/lib.rs b/tvix/tracing/src/lib.rs index 8ad90835338e..312c9a5a9a62 100644 --- a/tvix/tracing/src/lib.rs +++ b/tvix/tracing/src/lib.rs @@ -1,5 +1,6 @@ use indicatif::ProgressStyle; use lazy_static::lazy_static; +use tokio::sync::{mpsc, oneshot}; use tracing::Level; use tracing_indicatif::{filter::IndicatifFilter, IndicatifLayer}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer}; @@ -24,20 +25,136 @@ lazy_static! { .expect("invalid progress template"); } -// using a macro_rule here because of the complex return type -macro_rules! init_base_subscriber { - ($level: expr) => {{ - let indicatif_layer = IndicatifLayer::new().with_progress_style(PB_SPINNER_STYLE.clone()); +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error(transparent)] + Init(#[from] tracing_subscriber::util::TryInitError), + + #[error(transparent)] + MpscSend(#[from] mpsc::error::SendError<Option<oneshot::Sender<()>>>), + + #[error(transparent)] + OneshotRecv(#[from] oneshot::error::RecvError), +} + +#[derive(Clone)] +pub struct TracingHandle { + tx: Option<mpsc::Sender<Option<oneshot::Sender<()>>>>, +} + +impl TracingHandle { + /// This will flush possible attached tracing providers, e.g. otlp exported, if enabled. + /// If there is none enabled this will result in a noop. + /// + /// It will not wait until the flush is complete, but you can pass in an oneshot::Sender which + /// will receive a message once the flush is completed. + pub async fn flush(&self, msg: Option<oneshot::Sender<()>>) -> Result<(), Error> { + if let Some(tx) = &self.tx { + Ok(tx.send(msg).await?) + } else { + // If we have a message passed in we need to notify the receiver + if let Some(tx) = msg { + let _ = tx.send(()); + } + Ok(()) + } + } + /// This will flush all all attached tracing providers and will wait until the flush is completed. + /// If no tracing providers like otlp are attached then this will be a noop. + /// + /// This should only be called on a regular shutdown. + /// If you correctly need to shutdown tracing on ctrl_c use [force_shutdown](#method.force_shutdown) + /// otherwise you will get otlp errors. + pub async fn shutdown(&self) -> Result<(), Error> { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.flush(Some(tx)).await?; + rx.await?; + Ok(()) + } + + /// This will flush all all attached tracing providers and will wait until the flush is completed. + /// After this it will do some other necessary cleanup. + /// If no tracing providers like otlp are attached then this will be a noop. + /// + /// This should only be used if the tool received an ctrl_c otherwise you will get otlp errors. + /// If you need to shutdown tracing on a regular exit, you should use the [shutdown](#method.shutdown) + /// method. + pub async fn force_shutdown(&self) -> Result<(), Error> { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.flush(Some(tx)).await?; + rx.await?; + + #[cfg(feature = "otlp")] + { + // Because of a bug within otlp we currently have to use spawn_blocking otherwise + // calling `shutdown_tracer_provider` can block forever. See + // https://github.com/open-telemetry/opentelemetry-rust/issues/1395#issuecomment-1953280335 + // + // This still throws an error, if the tool exits regularly: "OpenTelemetry trace error + // occurred. oneshot canceled", but not having this leads to errors if we cancel with + // ctrl_c. + // So this should right now only be used on ctrl_c, for a regular exit use the + // [shutdown](#shutdown) method + let _ = tokio::task::spawn_blocking(move || { + opentelemetry::global::shutdown_tracer_provider(); + }) + .await; + } + + Ok(()) + } +} + +pub struct TracingBuilder { + level: Level, + + #[cfg(feature = "otlp")] + service_name: Option<&'static str>, +} + +impl Default for TracingBuilder { + fn default() -> Self { + TracingBuilder { + level: Level::INFO, + + #[cfg(feature = "otlp")] + service_name: None, + } + } +} + +impl TracingBuilder { + /// Set the log level of the stderr writer, RUST_LOG still has a higher priority over this + pub fn level(mut self, level: Level) -> TracingBuilder { + self.level = level; + self + } + + #[cfg(feature = "otlp")] + /// Enable otlp by setting a custom service_name + pub fn enable_otlp(mut self, service_name: &'static str) -> TracingBuilder { + self.service_name = Some(service_name); + self + } + + /// This will setup tracing based on the configuration passed in. + /// It will setup a stderr writer with the provided log level as filter (RUST_LOG still has a + /// higher priority over the configured value) + /// + /// It will also configure otlp if the feature is enabled and a service_name was provided. It + /// will then correctly setup a channel which is later used for flushing the provider. + pub fn build(self) -> Result<TracingHandle, Error> { // Set up the tracing subscriber. - tracing_subscriber::registry() + let indicatif_layer = IndicatifLayer::new().with_progress_style(PB_SPINNER_STYLE.clone()); + let subscriber = tracing_subscriber::registry() .with( tracing_subscriber::fmt::Layer::new() .with_writer(indicatif_layer.get_stderr_writer()) .compact() .with_filter( EnvFilter::builder() - .with_default_directive($level.into()) + .with_default_directive(self.level.into()) .from_env() .expect("invalid RUST_LOG"), ), @@ -45,46 +162,73 @@ macro_rules! init_base_subscriber { .with(indicatif_layer.with_filter( // only show progress for spans with indicatif.pb_show field being set IndicatifFilter::new(false), - )) - }}; -} + )); -pub fn init(level: Level) -> Result<(), tracing_subscriber::util::TryInitError> { - init_base_subscriber!(level).try_init() -} + // Setup otlp if a service_name is configured + #[cfg(feature = "otlp")] + { + if let Some(service_name) = self.service_name { + let tracer = opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(opentelemetry_otlp::new_exporter().tonic()) + .with_batch_config(BatchConfig::default()) + .with_trace_config(opentelemetry_sdk::trace::config().with_resource({ + // use SdkProvidedResourceDetector.detect to detect resources, + // but replace the default service name with our default. + // https://github.com/open-telemetry/opentelemetry-rust/issues/1298 + let resources = + SdkProvidedResourceDetector.detect(std::time::Duration::from_secs(0)); + // SdkProvidedResourceDetector currently always sets + // `service.name`, but we don't like its default. + if resources.get("service.name".into()).unwrap() == "unknown_service".into() + { + resources.merge(&Resource::new([KeyValue::new( + "service.name", + service_name, + )])) + } else { + resources + } + })) + .install_batch(opentelemetry_sdk::runtime::Tokio) + .expect("Failed to install batch exporter using Tokio"); -#[cfg(feature = "otlp")] -pub fn init_with_otlp( - level: Level, - service_name: &'static str, -) -> Result<(), tracing_subscriber::util::TryInitError> { - let subscriber = init_base_subscriber!(level); - - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter(opentelemetry_otlp::new_exporter().tonic()) - .with_batch_config(BatchConfig::default()) - .with_trace_config(opentelemetry_sdk::trace::config().with_resource({ - // use SdkProvidedResourceDetector.detect to detect resources, - // but replace the default service name with our default. - // https://github.com/open-telemetry/opentelemetry-rust/issues/1298 - let resources = SdkProvidedResourceDetector.detect(std::time::Duration::from_secs(0)); - // SdkProvidedResourceDetector currently always sets - // `service.name`, but we don't like its default. - if resources.get("service.name".into()).unwrap() == "unknown_service".into() { - resources.merge(&Resource::new([KeyValue::new( - "service.name", - service_name, - )])) - } else { - resources - } - })) - .install_batch(opentelemetry_sdk::runtime::Tokio) - .expect("Failed to install tokio runtime"); + // Trace provider is need for later use like flushing the provider. + // Needs to be kept around for each message to rx we need to handle. + let tracer_provider = tracer + .provider() + .expect("Failed to get the tracer provider"); - // Create a tracing layer with the configured tracer - let layer = tracing_opentelemetry::layer().with_tracer(tracer); + // Set up a channel for flushing trace providers later + let (tx, mut rx) = mpsc::channel::<Option<oneshot::Sender<()>>>(16); + + // Spawning a task that listens on rx for any message. Once we receive a message we + // correctly call flush on the tracer_provider. + tokio::spawn(async move { + while let Some(m) = rx.recv().await { + // Because of a bug within otlp we currently have to use spawn_blocking + // otherwise will calling `force_flush` block forever, especially if the + // tool was closed with ctrl_c. See + // https://github.com/open-telemetry/opentelemetry-rust/issues/1395#issuecomment-1953280335 + let _ = tokio::task::spawn_blocking({ + let tracer_provider = tracer_provider.clone(); + move || tracer_provider.force_flush() + }) + .await; + if let Some(tx) = m { + let _ = tx.send(()); + } + } + }); + + // Create a tracing layer with the configured tracer + let layer = tracing_opentelemetry::layer().with_tracer(tracer); + subscriber.with(Some(layer)).try_init()?; + return Ok(TracingHandle { tx: Some(tx) }); + } + } - subscriber.with(Some(layer)).try_init() + subscriber.try_init()?; + Ok(TracingHandle { tx: None }) + } } |