mod.rs - source (original) (raw)

polars_time/chunkedarray/string/

mod.rs

1pub mod infer;
2use chrono::DateTime;
3mod patterns;
4mod strptime;
5use chrono::ParseError;
6use chrono::format::ParseErrorKind;
7pub use patterns::Pattern;
8#[cfg(feature = "dtype-time")]
9use polars_core::chunked_array::temporal::time_to_time64ns;
10use polars_core::prelude::arity::unary_elementwise;
11use polars_utils::cache::LruCachedFunc;
12
13use super::*;
14#[cfg(feature = "dtype-date")]
use crate::chunkedarray::date::naive_date_to_date;
use crate::prelude::string::strptime::StrpTimeState;
17
/// Finds the first known time format that successfully parses `val`.
///
/// `convert` is invoked as `convert(val, fmt)` for every candidate in
/// `patterns::TIME_H_M_S`, in declaration order. The first format for which
/// it returns `Ok` is returned; `None` means no candidate parsed `val`.
#[cfg(feature = "dtype-time")]
fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
where
    // (string, fmt) -> chrono::ParseResult
    F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
    // Each candidate is tried exactly once: re-scanning the same list could
    // never change the outcome and would only double the cost of a miss.
    patterns::TIME_H_M_S
        .iter()
        .copied()
        .find(|fmt| convert(val, fmt).is_ok())
}
30
31fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
32// (string, fmt) -> PolarsResult
33where
34    F: Fn(&str, &str) -> chrono::ParseResult<K>,
35{
36    patterns::DATETIME_Y_M_D
37        .iter()
38        .chain(patterns::DATETIME_D_M_Y)
39        .find(|fmt| convert(val, fmt).is_ok())
40        .copied()
41}
42
43fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
44// (string, fmt) -> PolarsResult
45where
46    F: Fn(&str, &str) -> chrono::ParseResult<K>,
47{
48    patterns::DATE_Y_M_D
49        .iter()
50        .chain(patterns::DATE_D_M_Y)
51        .find(|fmt| convert(val, fmt).is_ok())
52        .copied()
53}
54
55struct ParseErrorByteCopy(ParseErrorKind);
56
57impl From<ParseError> for ParseErrorByteCopy {
58    fn from(e: ParseError) -> Self {
59        ParseErrorByteCopy(e.kind())
60    }
61}
62
63fn get_first_val(ca: &StringChunked) -> PolarsResult<&str> {
64    let idx = ca.first_non_null().ok_or_else(|| {
65        polars_err!(ComputeError:
66            "unable to determine date parsing format, all values are null",
67        )
68    })?;
69    Ok(ca.get(idx).expect("should not be null"))
70}
71
/// Infers a datetime format from the first non-null value of `ca_string`.
///
/// The value is first matched against the known datetime patterns using
/// `NaiveDateTime::parse_from_str`; if none fits, the same patterns are
/// retried with `NaiveDate::parse_from_str`.
///
/// # Errors
///
/// Fails when all values are null or when no known pattern parses the value.
#[cfg(feature = "dtype-datetime")]
fn sniff_fmt_datetime(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    if let Some(fmt) = datetime_pattern(first, NaiveDateTime::parse_from_str) {
        return Ok(fmt);
    }
    match datetime_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "datetime")),
    }
}
79
/// Infers a date format from the first non-null value of `ca_string`.
///
/// # Errors
///
/// Fails when all values are null or when no known date pattern parses the
/// value with `NaiveDate::parse_from_str`.
#[cfg(feature = "dtype-date")]
fn sniff_fmt_date(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match date_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "date")),
    }
}
85
/// Infers a time format from the first non-null value of `ca_string`.
///
/// # Errors
///
/// Fails when all values are null or when no known time pattern parses the
/// value with `NaiveTime::parse_from_str`.
#[cfg(feature = "dtype-time")]
fn sniff_fmt_time(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    if let Some(fmt) = time_pattern(first, NaiveTime::parse_from_str) {
        Ok(fmt)
    } else {
        Err(polars_err!(parse_fmt_idk = "time"))
    }
}
91
92pub trait StringMethods: AsString {
93    #[cfg(feature = "dtype-time")]
94    /// Parsing string values and return a [`TimeChunked`]
95    fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {
96        let string_ca = self.as_string();
97        let fmt = match fmt {
98            Some(fmt) => fmt,
99            None => sniff_fmt_time(string_ca)?,
100        };
101        let use_cache = use_cache && string_ca.len() > 50;
102
103        let mut convert = LruCachedFunc::new(
104            |s| {
105                let naive_time = NaiveTime::parse_from_str(s, fmt).ok()?;
106                Some(time_to_time64ns(&naive_time))
107            },
108            (string_ca.len() as f64).sqrt() as usize,
109        );
110        let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));
111        Ok(ca.with_name(string_ca.name().clone()).into_time())
112    }
113
114    #[cfg(feature = "dtype-date")]
115    /// Parsing string values and return a [`DateChunked`]
116    /// Different from `as_date` this function allows matches that not contain the whole string
117    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
118    fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {
119        let string_ca = self.as_string();
120        let fmt = match fmt {
121            Some(fmt) => fmt,
122            None => sniff_fmt_date(string_ca)?,
123        };
124        let ca = unary_elementwise(string_ca, |opt_s| {
125            let mut s = opt_s?;
126            let fmt_len = fmt.len();
127
128            for i in 1..(s.len().saturating_sub(fmt_len)) {
129                if s.is_empty() {
130                    return None;
131                }
132                match NaiveDate::parse_from_str(s, fmt).map(naive_date_to_date) {
133                    Ok(nd) => return Some(nd),
134                    Err(e) => match ParseErrorByteCopy::from(e).0 {
135                        ParseErrorKind::TooLong => {
136                            s = &s[..s.len() - 1];
137                        },
138                        _ => {
139                            s = &s[i..];
140                        },
141                    },
142                }
143            }
144            None
145        });
146        Ok(ca.with_name(string_ca.name().clone()).into_date())
147    }
148
149    #[cfg(feature = "dtype-datetime")]
150    /// Parsing string values and return a [`DatetimeChunked`]
151    /// Different from `as_datetime` this function allows matches that not contain the whole string
152    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
153    fn as_datetime_not_exact(
154        &self,
155        fmt: Option<&str>,
156        tu: TimeUnit,
157        tz_aware: bool,
158        tz: Option<&TimeZone>,
159        _ambiguous: &StringChunked,
160    ) -> PolarsResult<DatetimeChunked> {
161        let string_ca = self.as_string();
162        let fmt = match fmt {
163            Some(fmt) => fmt,
164            None => sniff_fmt_datetime(string_ca)?,
165        };
166
167        let func = match tu {
168            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
169            TimeUnit::Microseconds => datetime_to_timestamp_us,
170            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
171        };
172
173        let ca = unary_elementwise(string_ca, |opt_s| {
174            let mut s = opt_s?;
175            let fmt_len = fmt.len();
176
177            for i in 1..(s.len().saturating_sub(fmt_len)) {
178                if s.is_empty() {
179                    return None;
180                }
181                let timestamp = if tz_aware {
182                    DateTime::parse_from_str(s, fmt).map(|dt| func(dt.naive_utc()))
183                } else {
184                    NaiveDateTime::parse_from_str(s, fmt).map(func)
185                };
186                match timestamp {
187                    Ok(ts) => return Some(ts),
188                    Err(e) => {
189                        let e: ParseErrorByteCopy = e.into();
190                        match e.0 {
191                            ParseErrorKind::TooLong => {
192                                s = &s[..s.len() - 1];
193                            },
194                            _ => {
195                                s = &s[i..];
196                            },
197                        }
198                    },
199                }
200            }
201            None
202        })
203        .with_name(string_ca.name().clone());
204        match (tz_aware, tz) {
205            #[cfg(feature = "timezones")]
206            (false, Some(tz)) => polars_ops::prelude::replace_time_zone(
207                &ca.into_datetime(tu, None),
208                Some(tz),
209                _ambiguous,
210                NonExistent::Raise,
211            ),
212            #[cfg(feature = "timezones")]
213            (true, tz) => Ok(ca.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC)))),
214            _ => Ok(ca.into_datetime(tu, None)),
215        }
216    }
217
218    #[cfg(feature = "dtype-date")]
219    /// Parsing string values and return a [`DateChunked`]
220    fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {
221        let string_ca = self.as_string();
222        let fmt = match fmt {
223            Some(fmt) => fmt,
224            None => return infer::to_date(string_ca),
225        };
226        let use_cache = use_cache && string_ca.len() > 50;
227        let fmt = strptime::compile_fmt(fmt)?;
228
229        // We can use the fast parser.
230        let ca = if let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) {
231            let mut strptime_cache = StrpTimeState::default();
232            let mut convert = LruCachedFunc::new(
233                |s: &str| {
234                    // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
235                    match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
236                        // Fallback to chrono.
237                        None => NaiveDate::parse_from_str(s, &fmt).ok(),
238                        Some(ndt) => Some(ndt.date()),
239                    }
240                    .map(naive_date_to_date)
241                },
242                (string_ca.len() as f64).sqrt() as usize,
243            );
244            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
245        } else {
246            let mut convert = LruCachedFunc::new(
247                |s| {
248                    let naive_date = NaiveDate::parse_from_str(s, &fmt).ok()?;
249                    Some(naive_date_to_date(naive_date))
250                },
251                (string_ca.len() as f64).sqrt() as usize,
252            );
253            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
254        };
255
256        Ok(ca.with_name(string_ca.name().clone()).into_date())
257    }
258
259    #[cfg(feature = "dtype-datetime")]
260    /// Parsing string values and return a [`DatetimeChunked`].
261    fn as_datetime(
262        &self,
263        fmt: Option<&str>,
264        tu: TimeUnit,
265        use_cache: bool,
266        tz_aware: bool,
267        tz: Option<&TimeZone>,
268        ambiguous: &StringChunked,
269    ) -> PolarsResult<DatetimeChunked> {
270        let string_ca = self.as_string();
271        let fmt = match fmt {
272            Some(fmt) => fmt,
273            None => return infer::to_datetime(string_ca, tu, tz, ambiguous),
274        };
275        let fmt = strptime::compile_fmt(fmt)?;
276        let use_cache = use_cache && string_ca.len() > 50;
277
278        let func = match tu {
279            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
280            TimeUnit::Microseconds => datetime_to_timestamp_us,
281            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
282        };
283
284        if tz_aware {
285            #[cfg(feature = "timezones")]
286            {
287                let mut convert = LruCachedFunc::new(
288                    |s: &str| {
289                        let dt = DateTime::parse_from_str(s, &fmt).ok()?;
290                        Some(func(dt.naive_utc()))
291                    },
292                    (string_ca.len() as f64).sqrt() as usize,
293                );
294                Ok(
295                    unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
296                        .with_name(string_ca.name().clone())
297                        .into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC))),
298                )
299            }
300            #[cfg(not(feature = "timezones"))]
301            {
302                panic!("activate 'timezones' feature")
303            }
304        } else {
305            let transform = match tu {
306                TimeUnit::Nanoseconds => infer::transform_datetime_ns,
307                TimeUnit::Microseconds => infer::transform_datetime_us,
308                TimeUnit::Milliseconds => infer::transform_datetime_ms,
309            };
310            // We can use the fast parser.
311            let ca = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
312                let mut strptime_cache = StrpTimeState::default();
313                let mut convert = LruCachedFunc::new(
314                    |s: &str| {
315                        // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
316                        match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) }
317                        {
318                            None => transform(s, &fmt),
319                            Some(ndt) => Some(func(ndt)),
320                        }
321                    },
322                    (string_ca.len() as f64).sqrt() as usize,
323                );
324                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
325            } else {
326                let mut convert = LruCachedFunc::new(
327                    |s| transform(s, &fmt),
328                    (string_ca.len() as f64).sqrt() as usize,
329                );
330                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
331            };
332            let dt = ca
333                .with_name(string_ca.name().clone())
334                .into_datetime(tu, None);
335            match tz {
336                #[cfg(feature = "timezones")]
337                Some(tz) => polars_ops::prelude::replace_time_zone(
338                    &dt,
339                    Some(tz),
340                    ambiguous,
341                    NonExistent::Raise,
342                ),
343                _ => Ok(dt),
344            }
345        }
346    }
347}
348
349pub trait AsString {
350    fn as_string(&self) -> &StringChunked;
351}
352
353impl AsString for StringChunked {
354    fn as_string(&self) -> &StringChunked {
355        self
356    }
357}
358
359impl StringMethods for StringChunked {}