mod.rs - source (original) (raw)
polars_time/chunkedarray/string/
mod.rs
1pub mod infer;
2use chrono::DateTime;
3mod patterns;
4mod strptime;
5use chrono::ParseError;
6use chrono::format::ParseErrorKind;
7pub use patterns::Pattern;
8#[cfg(feature = "dtype-time")]
9use polars_core::chunked_array::temporal::time_to_time64ns;
10use polars_core::prelude::arity::unary_elementwise;
11use polars_utils::cache::LruCachedFunc;
12
13use super::*;
14#[cfg(feature = "dtype-date")]
use crate::chunkedarray::date::naive_date_to_date;
use crate::prelude::string::strptime::StrpTimeState;
17
#[cfg(feature = "dtype-time")]
/// Returns the first built-in time format that successfully parses `val`.
///
/// `convert` is invoked as `convert(val, fmt)` for each candidate in
/// `patterns::TIME_H_M_S`, in order. `None` means no candidate parsed `val`.
fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
where
    // (string, fmt) -> chrono::ParseResult
    F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
    // One pass is enough: retrying the very same list a second time cannot
    // turn a failed parse into a successful one.
    patterns::TIME_H_M_S
        .iter()
        .copied()
        .find(|fmt| convert(val, fmt).is_ok())
}
30
31fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
32// (string, fmt) -> PolarsResult
33where
34 F: Fn(&str, &str) -> chrono::ParseResult<K>,
35{
36 patterns::DATETIME_Y_M_D
37 .iter()
38 .chain(patterns::DATETIME_D_M_Y)
39 .find(|fmt| convert(val, fmt).is_ok())
40 .copied()
41}
42
43fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
44// (string, fmt) -> PolarsResult
45where
46 F: Fn(&str, &str) -> chrono::ParseResult<K>,
47{
48 patterns::DATE_Y_M_D
49 .iter()
50 .chain(patterns::DATE_D_M_Y)
51 .find(|fmt| convert(val, fmt).is_ok())
52 .copied()
53}
54
55struct ParseErrorByteCopy(ParseErrorKind);
56
57impl From<ParseError> for ParseErrorByteCopy {
58 fn from(e: ParseError) -> Self {
59 ParseErrorByteCopy(e.kind())
60 }
61}
62
63fn get_first_val(ca: &StringChunked) -> PolarsResult<&str> {
64 let idx = ca.first_non_null().ok_or_else(|| {
65 polars_err!(ComputeError:
66 "unable to determine date parsing format, all values are null",
67 )
68 })?;
69 Ok(ca.get(idx).expect("should not be null"))
70}
71
#[cfg(feature = "dtype-datetime")]
/// Infers a datetime format from the first non-null value of `ca_string`.
///
/// The value is first tried as a `NaiveDateTime`; if no pattern fits, the same
/// patterns are tried again with `NaiveDate` parsing.
///
/// # Errors
/// Fails when every value is null or when no known pattern parses the value.
fn sniff_fmt_datetime(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    if let Some(fmt) = datetime_pattern(first, NaiveDateTime::parse_from_str) {
        return Ok(fmt);
    }
    match datetime_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "datetime")),
    }
}
79
#[cfg(feature = "dtype-date")]
/// Infers a date format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when every value is null or when no known pattern parses the value.
fn sniff_fmt_date(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match date_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "date")),
    }
}
85
#[cfg(feature = "dtype-time")]
/// Infers a time format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when every value is null or when no known pattern parses the value.
fn sniff_fmt_time(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match time_pattern(first, NaiveTime::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "time")),
    }
}
91
92pub trait StringMethods: AsString {
93 #[cfg(feature = "dtype-time")]
94 /// Parsing string values and return a [`TimeChunked`]
95 fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {
96 let string_ca = self.as_string();
97 let fmt = match fmt {
98 Some(fmt) => fmt,
99 None => sniff_fmt_time(string_ca)?,
100 };
101 let use_cache = use_cache && string_ca.len() > 50;
102
103 let mut convert = LruCachedFunc::new(
104 |s| {
105 let naive_time = NaiveTime::parse_from_str(s, fmt).ok()?;
106 Some(time_to_time64ns(&naive_time))
107 },
108 (string_ca.len() as f64).sqrt() as usize,
109 );
110 let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));
111 Ok(ca.with_name(string_ca.name().clone()).into_time())
112 }
113
114 #[cfg(feature = "dtype-date")]
115 /// Parsing string values and return a [`DateChunked`]
116 /// Different from `as_date` this function allows matches that not contain the whole string
117 /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
118 fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {
119 let string_ca = self.as_string();
120 let fmt = match fmt {
121 Some(fmt) => fmt,
122 None => sniff_fmt_date(string_ca)?,
123 };
124 let ca = unary_elementwise(string_ca, |opt_s| {
125 let mut s = opt_s?;
126 let fmt_len = fmt.len();
127
128 for i in 1..(s.len().saturating_sub(fmt_len)) {
129 if s.is_empty() {
130 return None;
131 }
132 match NaiveDate::parse_from_str(s, fmt).map(naive_date_to_date) {
133 Ok(nd) => return Some(nd),
134 Err(e) => match ParseErrorByteCopy::from(e).0 {
135 ParseErrorKind::TooLong => {
136 s = &s[..s.len() - 1];
137 },
138 _ => {
139 s = &s[i..];
140 },
141 },
142 }
143 }
144 None
145 });
146 Ok(ca.with_name(string_ca.name().clone()).into_date())
147 }
148
149 #[cfg(feature = "dtype-datetime")]
150 /// Parsing string values and return a [`DatetimeChunked`]
151 /// Different from `as_datetime` this function allows matches that not contain the whole string
152 /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
153 fn as_datetime_not_exact(
154 &self,
155 fmt: Option<&str>,
156 tu: TimeUnit,
157 tz_aware: bool,
158 tz: Option<&TimeZone>,
159 _ambiguous: &StringChunked,
160 ) -> PolarsResult<DatetimeChunked> {
161 let string_ca = self.as_string();
162 let fmt = match fmt {
163 Some(fmt) => fmt,
164 None => sniff_fmt_datetime(string_ca)?,
165 };
166
167 let func = match tu {
168 TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
169 TimeUnit::Microseconds => datetime_to_timestamp_us,
170 TimeUnit::Milliseconds => datetime_to_timestamp_ms,
171 };
172
173 let ca = unary_elementwise(string_ca, |opt_s| {
174 let mut s = opt_s?;
175 let fmt_len = fmt.len();
176
177 for i in 1..(s.len().saturating_sub(fmt_len)) {
178 if s.is_empty() {
179 return None;
180 }
181 let timestamp = if tz_aware {
182 DateTime::parse_from_str(s, fmt).map(|dt| func(dt.naive_utc()))
183 } else {
184 NaiveDateTime::parse_from_str(s, fmt).map(func)
185 };
186 match timestamp {
187 Ok(ts) => return Some(ts),
188 Err(e) => {
189 let e: ParseErrorByteCopy = e.into();
190 match e.0 {
191 ParseErrorKind::TooLong => {
192 s = &s[..s.len() - 1];
193 },
194 _ => {
195 s = &s[i..];
196 },
197 }
198 },
199 }
200 }
201 None
202 })
203 .with_name(string_ca.name().clone());
204 match (tz_aware, tz) {
205 #[cfg(feature = "timezones")]
206 (false, Some(tz)) => polars_ops::prelude::replace_time_zone(
207 &ca.into_datetime(tu, None),
208 Some(tz),
209 _ambiguous,
210 NonExistent::Raise,
211 ),
212 #[cfg(feature = "timezones")]
213 (true, tz) => Ok(ca.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC)))),
214 _ => Ok(ca.into_datetime(tu, None)),
215 }
216 }
217
218 #[cfg(feature = "dtype-date")]
219 /// Parsing string values and return a [`DateChunked`]
220 fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {
221 let string_ca = self.as_string();
222 let fmt = match fmt {
223 Some(fmt) => fmt,
224 None => return infer::to_date(string_ca),
225 };
226 let use_cache = use_cache && string_ca.len() > 50;
227 let fmt = strptime::compile_fmt(fmt)?;
228
229 // We can use the fast parser.
230 let ca = if let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) {
231 let mut strptime_cache = StrpTimeState::default();
232 let mut convert = LruCachedFunc::new(
233 |s: &str| {
234 // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
235 match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
236 // Fallback to chrono.
237 None => NaiveDate::parse_from_str(s, &fmt).ok(),
238 Some(ndt) => Some(ndt.date()),
239 }
240 .map(naive_date_to_date)
241 },
242 (string_ca.len() as f64).sqrt() as usize,
243 );
244 unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
245 } else {
246 let mut convert = LruCachedFunc::new(
247 |s| {
248 let naive_date = NaiveDate::parse_from_str(s, &fmt).ok()?;
249 Some(naive_date_to_date(naive_date))
250 },
251 (string_ca.len() as f64).sqrt() as usize,
252 );
253 unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
254 };
255
256 Ok(ca.with_name(string_ca.name().clone()).into_date())
257 }
258
259 #[cfg(feature = "dtype-datetime")]
260 /// Parsing string values and return a [`DatetimeChunked`].
261 fn as_datetime(
262 &self,
263 fmt: Option<&str>,
264 tu: TimeUnit,
265 use_cache: bool,
266 tz_aware: bool,
267 tz: Option<&TimeZone>,
268 ambiguous: &StringChunked,
269 ) -> PolarsResult<DatetimeChunked> {
270 let string_ca = self.as_string();
271 let fmt = match fmt {
272 Some(fmt) => fmt,
273 None => return infer::to_datetime(string_ca, tu, tz, ambiguous),
274 };
275 let fmt = strptime::compile_fmt(fmt)?;
276 let use_cache = use_cache && string_ca.len() > 50;
277
278 let func = match tu {
279 TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
280 TimeUnit::Microseconds => datetime_to_timestamp_us,
281 TimeUnit::Milliseconds => datetime_to_timestamp_ms,
282 };
283
284 if tz_aware {
285 #[cfg(feature = "timezones")]
286 {
287 let mut convert = LruCachedFunc::new(
288 |s: &str| {
289 let dt = DateTime::parse_from_str(s, &fmt).ok()?;
290 Some(func(dt.naive_utc()))
291 },
292 (string_ca.len() as f64).sqrt() as usize,
293 );
294 Ok(
295 unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
296 .with_name(string_ca.name().clone())
297 .into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC))),
298 )
299 }
300 #[cfg(not(feature = "timezones"))]
301 {
302 panic!("activate 'timezones' feature")
303 }
304 } else {
305 let transform = match tu {
306 TimeUnit::Nanoseconds => infer::transform_datetime_ns,
307 TimeUnit::Microseconds => infer::transform_datetime_us,
308 TimeUnit::Milliseconds => infer::transform_datetime_ms,
309 };
310 // We can use the fast parser.
311 let ca = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
312 let mut strptime_cache = StrpTimeState::default();
313 let mut convert = LruCachedFunc::new(
314 |s: &str| {
315 // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
316 match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) }
317 {
318 None => transform(s, &fmt),
319 Some(ndt) => Some(func(ndt)),
320 }
321 },
322 (string_ca.len() as f64).sqrt() as usize,
323 );
324 unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
325 } else {
326 let mut convert = LruCachedFunc::new(
327 |s| transform(s, &fmt),
328 (string_ca.len() as f64).sqrt() as usize,
329 );
330 unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
331 };
332 let dt = ca
333 .with_name(string_ca.name().clone())
334 .into_datetime(tu, None);
335 match tz {
336 #[cfg(feature = "timezones")]
337 Some(tz) => polars_ops::prelude::replace_time_zone(
338 &dt,
339 Some(tz),
340 ambiguous,
341 NonExistent::Raise,
342 ),
343 _ => Ok(dt),
344 }
345 }
346 }
347}
348
349pub trait AsString {
350 fn as_string(&self) -> &StringChunked;
351}
352
353impl AsString for StringChunked {
354 fn as_string(&self) -> &StringChunked {
355 self
356 }
357}
358
359impl StringMethods for StringChunked {}