Fix issues 3

This commit is contained in:
Urtzi Alfaro
2025-08-17 13:35:05 +02:00
parent d21094a940
commit cafd316c4b
5 changed files with 157 additions and 49 deletions

View File

@@ -865,34 +865,65 @@ class DataImportService:
return mapping
def _parse_date(self, date_str: str) -> Optional[datetime]:
    """Parse a date string into a timezone-aware ``datetime``.

    Parsing is attempted in three stages, stopping at the first success:

    1. Explicit ``strptime`` formats, year-first ones first, so that
       values such as ``2024/10/01`` are never read day-first.
    2. ``pd.to_datetime``; ``dayfirst`` is disabled when the text starts
       with a four-digit field followed by ``/`` and enabled otherwise.
    3. The remaining entries of ``self.DATE_FORMATS``.

    Args:
        date_str: Raw cell value. Non-string values are converted with
            ``str()`` before parsing.

    Returns:
        The parsed ``datetime`` (naive results are tagged as UTC), or
        ``None`` when the value is empty, a null-like token, or cannot
        be parsed by any stage.
    """
    if not date_str:
        return None

    # Strip before the null-token check so padded values (" nan ") are
    # recognised too.
    date_str = str(date_str).strip()
    if not date_str or date_str.lower() in ('nan', 'nat', 'null', 'none'):
        return None

    def _as_utc(value: datetime) -> datetime:
        # Naive results are assumed to be UTC; aware ones are left alone.
        if value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value

    # For CSV format like "2024/10/01", try specific formats first to
    # avoid ambiguity. Order matters: the first matching format wins.
    priority_formats = (
        '%Y/%m/%d',  # 2024/10/01 - most likely for CSV exports
        '%Y-%m-%d',  # 2024-10-01
        '%d/%m/%Y',  # 01/10/2024 (European format)
        '%m/%d/%Y',  # 10/01/2024 (US format)
    )

    for fmt in priority_formats:
        try:
            parsed_dt = _as_utc(datetime.strptime(date_str, fmt))
        except ValueError:
            continue
        logger.debug(f"Successfully parsed date '{date_str}' using format '{fmt}' -> {parsed_dt}")
        return parsed_dt

    # pandas as fallback. Deliberately best-effort: any pandas failure is
    # logged and the remaining explicit formats are tried afterwards.
    try:
        # A leading 4-character field before '/' looks like YYYY/MM/DD,
        # so day-first interpretation must be switched off for it.
        year_first = '/' in date_str and len(date_str.split('/')[0]) == 4
        parsed_dt = pd.to_datetime(date_str, dayfirst=not year_first)

        # pd.to_datetime yields NaT (instead of raising) for NaT-like
        # text; NaT must not be returned to callers as a date.
        if not pd.isna(parsed_dt):
            if hasattr(parsed_dt, 'to_pydatetime'):
                parsed_dt = parsed_dt.to_pydatetime()
            parsed_dt = _as_utc(parsed_dt)
            logger.debug(f"Successfully parsed date '{date_str}' using pandas -> {parsed_dt}")
            return parsed_dt
    except Exception as e:
        logger.debug(f"Pandas date parsing failed for '{date_str}': {e}")

    # Last fallback: configured formats that were not already tried above.
    for fmt in self.DATE_FORMATS:
        if fmt in priority_formats:
            continue
        try:
            parsed_dt = _as_utc(datetime.strptime(date_str, fmt))
        except ValueError:
            continue
        logger.debug(f"Successfully parsed date '{date_str}' using fallback format '{fmt}' -> {parsed_dt}")
        return parsed_dt

    logger.warning(f"Could not parse date: {date_str}")
    return None