#830: AddressTextParser.kt
projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt Класс Kotlin, projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt 369 строк · 224 кода · 91 комментарий · 54 пустых
Назначение: Исходный файл: projectforge/business/address/AddressTextParser.kt. AddressTextParser.kt является частью приложения для управления проектами с открытым исходным кодом ProjectForge.
Исходный код (первые 100 строк)
/////////////////////////////////////////////////////////////////////////////
//
// Project ProjectForge Community Edition
// www.projectforge.org
//
// Copyright (C) 2001-2026 Micromata GmbH, Germany (www.micromata.com)
//
// ProjectForge is dual-licensed.
//
// This community edition is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License as published
// by the Free Software Foundation; version 3 of the License.
//
// This community edition is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
// Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, see http://www.gnu.org/licenses/.
//
/////////////////////////////////////////////////////////////////////////////
package org.projectforge.business.address
import mu.KotlinLogging
private val log = KotlinLogging.logger {}
/**
* Parser for extracting address information from free-form text (e.g. email signatures).
*/
object AddressTextParser {
// Company suffixes (legal-form abbreviations such as "GmbH" or "Ltd.").
// Literal dots are stored escaped ("\\."), so each entry is a regex fragment rather than plain text —
// presumably these are embedded into a regular expression elsewhere in the file (usage is not
// visible in this excerpt; confirm before changing the escaping).
private val COMPANY_SUFFIXES = listOf(
"GmbH",
"AG",
"e\\.V\\.",
"KG",
"OHG",
"UG",
"SE",
"Ltd\\.",
"Inc\\.",
"Corp\\.",
"LLC",
"PLC"
)
// Email regex: local part (letters, digits and . _ % + -), "@", a domain of letters, digits,
// dots and hyphens, and a final dot-separated label of at least two letters.
// Compiled case-insensitively.
private val EMAIL_REGEX = Regex(
"""[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}""",
RegexOption.IGNORE_CASE
)
// Website regex with known TLDs to avoid false positives like "Dipl.Phys"
// Note: Longer TLDs are listed before shorter ones to ensure correct matching (e.g., "group" before "gr")
// Shape: optional "http://" or "https://", optional "www.", host name, ".", one of the listed TLDs,
// and an optional path starting with "/" that runs up to the next whitespace.
// Compiled case-insensitively.
private val WEBSITE_REGEX = Regex(
"""(?:https?://)?(?:www\.)?[a-zA-Z0-9.-]+\.(?:solutions|company|academy|digital|center|online|store|group|cloud|gmbh|tech|info|shop|app|dev|pro|com|org|net|edu|gov|biz|aero|asia|coop|jobs|mobi|museum|name|post|tel|travel|xxx|de|uk|us|io|co|eu|ch|at|fr|it|es|nl|be|pl|ru|jp|cn|au|ca|nz|se|no|dk|fi|in|br|mx|za|kr|tw|hk|sg|my|th|vn|ph|id|ae|sa|il|tr|gr|cz|sk|hu|ro|bg|hr|si|lt|lv|ee|is|ie|pt|lu|mt|cy|ai)(?:/[^\s]*)?""",
RegexOption.IGNORE_CASE
)
// Phone regex (various formats with flexible separators)
// Matches phone numbers made of digits and common separators (spaces, -, /, ., parentheses)
// Supports both "Tel:" and "Tel" (with and without colon); the accepted labels are
// Tel / Tel. / Telefon / Phone / Fon / Mobil / Mobile / Fax, matched case-insensitively.
// Group 1 captures the number without the label: an optional leading "+", then digits and
// separators, always ending on a digit (so at least two digits are required).
private val PHONE_REGEX = Regex(
"""(?:Tel\.?:?|Telefon:?|Phone:?|Fon:?|Mobil:?|Mobile:?|Fax:?)\s*(\+?(?:\d+[\s\-./()]*)+\d)""",
RegexOption.IGNORE_CASE
)
// Bare phone regex (phone number without a prefix label such as "Tel:").
// Matches a whole line that optionally starts with "+", then begins with a digit and contains
// at least 8 digits, optionally separated by whitespace, "-", ".", "/" or parentheses.
// Requiring 8+ digits avoids false positives with regular numbers.
// Each repetition consumes exactly one digit (`\d`, not `\d+`): this accepts the same strings as
// nesting `\d+` inside the `{8,}` group, but leaves the engine only one way to split a run of
// digits, so a long non-matching line cannot trigger exponential backtracking.
private val BARE_PHONE_REGEX = Regex(
    """^\+?(?:\d[\s\-./()]*){8,}$"""
)
// ZIP + City (4-5 digits + city name, optionally with "D-" or "CH-" prefix)
// Supports German (5 digits), Swiss (4 digits), and other formats
// Group 1 is the ZIP code, group 2 the city name. The pattern is case-sensitive: the city must start
// with an uppercase letter, and further words are joined by whitespace or "-".
private val ZIP_CITY_REGEX = Regex(
"""(?:D-|CH-)?(\d{4,5})\s+([A-ZÄÖÜ][a-zäöüß]+(?:[\s-][A-ZÄÖÜ]?[a-zäöüß]+)*)""",
)
// Street (street name + house number)
// The name starts with an uppercase letter, may consist of several words joined by whitespace or "-",
// and may end with "." (e.g. an abbreviation). It is followed by whitespace or "-" and a house number
// with an optional letter suffix and an optional range ("12", "12a", "12-14").
// Group 1 is the whole street-plus-number match. The pattern is case-sensitive.
private val STREET_REGEX = Regex(
"""([A-ZÄÖÜ][a-zäöüß]+(?:[\s-][A-ZÄÖÜ]?[a-zäöüß]+)*\.?(?:\s+|-)(?:\d+[a-zA-Z]?(?:\s*-\s*\d+[a-zA-Z]?)?))""",
)
// Country name (common countries in multiple languages, optionally with a second name after "/")
// Anchored with ^...$, so the entire input must be a country name (or "Name / Name").
// Group 1 is the first country name. Compiled case-insensitively.
// NOTE(review): "Italia?" also accepts "Itali"; possibly "Italien" was intended — confirm.
private val COUNTRY_REGEX = Regex(
"""^(Deutschland|Germany|Schweiz|Switzerland|Österreich|Austria|France|Frankreich|Italia?|Italy|UK|USA|United States|United Kingdom|Nederland|Netherlands|Belgique|Belgium|España|Spain|Portugal|Sverige|Sweden|Norge|Norway|Danmark|Denmark|Polska|Poland|Česko|Czech Republic|Slovensko|Slovakia|Magyarország|Hungary|România|Romania|Bulgarien|Bulgaria|Ellinikí Demokratía|Greece|Türkiye|Turkey|Россия|Russia)(?:\s*/\s*(?:Deutschland|Germany|Schweiz|Switzerland|Österreich|Austria|France|Frankreich|Italia?|Italy|UK|USA|United States|United Kingdom|Nederland|Netherlands|Belgique|Belgium|España|Spain|Portugal|Sverige|Sweden|Norge|Norway|Danmark|Denmark|Polska|Poland|Česko|Czech Republic|Slovensko|Slovakia|Magyarország|Hungary|România|Romania|Bulgarien|Bulgaria|Ellinikí Demokratía|Greece|Türkiye|Turkey|Россия|Russia))?$""",
RegexOption.IGNORE_CASE
)
/**
* Parses free-form text and extracts address information.
*/
fun parseAddressText(text: String): ParsedAddressData {
История Git
868d6abb7 2025 -> 2026
30ec0db73 AddressImportReconciler
cab29b70b WIP: AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
48e37a4c9 AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
0b1ab35a7 AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
868d6abb7
2025 -> 2026
868d6abb75cd191a892911ac8e45058932cf9074
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index 6c334b22b..a4d96916a 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -3,7 +3,7 @@
// Project ProjectForge Community Edition
// www.projectforge.org
//
-// Copyright (C) 2001-2025 Micromata GmbH, Germany (www.micromata.com)
+// Copyright (C) 2001-2026 Micromata GmbH, Germany (www.micromata.com)
//
// ProjectForge is dual-licensed.
//
30ec0db73
AddressImportReconciler
30ec0db73e57c418559d0d754bfceb90c1997db2
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index c2f8e1068..6c334b22b 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -70,6 +70,13 @@ object AddressTextParser {
RegexOption.IGNORE_CASE
)
+ // Bare phone regex (phone number without prefix label)
+ // Matches phone numbers that start with + or country code and have at least 8 digits
+ // This avoids false positives with regular numbers
+ private val BARE_PHONE_REGEX = Regex(
+ """^\+?(?:\d+[\s\-./()]*){8,}$"""
+ )
+
// ZIP + City (4-5 digits + city name, optionally with "D-" or "CH-" prefix)
// Supports German (5 digits), Swiss (4 digits), and other formats
private val ZIP_CITY_REGEX = Regex(
@@ -135,7 +142,7 @@ object AddressTextParser {
}
}
- // Extract phone numbers
+ // Extract phone numbers with prefix (Tel:, Phone:, etc.)
if (line.matches(Regex(""".*(?:Tel\.?:?|Telefon:?|Phone:?|Fon:?|Mobil:?|Mobile:?|Fax:?).*""", RegexOption.IGNORE_CASE))) {
val phoneMatch = PHONE_REGEX.find(line)
if (phoneMatch != null) {
@@ -161,6 +168,25 @@ object AddressTextParser {
}
}
+ // Extract bare phone numbers (without prefix)
+ if (!processed && BARE_PHONE_REGEX.matches(line)) {
+ val phone = line.trim()
+ phoneNumbers.add(phone)
+
+ // Normalize phone number
+ val normalizedPhone = org.projectforge.framework.utils.PhoneNumberUtils.normalizePhoneNumber(phone)
+
+ // Without label, assume it's a business phone (unless we already have one, then mobile)
+ if (result.businessPhone == null) {
+ result.businessPhone = normalizedPhone
+ } else if (result.mobilePhone == null) {
+ result.mobilePhone = normalizedPhone
+ } else if (result.fax == null) {
+ result.fax = normalizedPhone
+ }
+ processed = true
+ }
+
// Extract ZIP + City (might be combined with street in same line)
ZIP_CITY_REGEX.find(line)?.let {
if (result.zipCode == null) {
cab29b70b
WIP: AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
cab29b70bfb32b34408aae8db933f60ec04bc405
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index 9b55d3d6a..c2f8e1068 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -55,9 +55,10 @@ object AddressTextParser {
RegexOption.IGNORE_CASE
)
- // Website regex
+ // Website regex with known TLDs to avoid false positives like "Dipl.Phys"
+ // Note: Longer TLDs are listed before shorter ones to ensure correct matching (e.g., "group" before "gr")
private val WEBSITE_REGEX = Regex(
- """(?:https?://)?(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?""",
+ """(?:https?://)?(?:www\.)?[a-zA-Z0-9.-]+\.(?:solutions|company|academy|digital|center|online|store|group|cloud|gmbh|tech|info|shop|app|dev|pro|com|org|net|edu|gov|biz|aero|asia|coop|jobs|mobi|museum|name|post|tel|travel|xxx|de|uk|us|io|co|eu|ch|at|fr|it|es|nl|be|pl|ru|jp|cn|au|ca|nz|se|no|dk|fi|in|br|mx|za|kr|tw|hk|sg|my|th|vn|ph|id|ae|sa|il|tr|gr|cz|sk|hu|ro|bg|hr|si|lt|lv|ee|is|ie|pt|lu|mt|cy|ai)(?:/[^\s]*)?""",
RegexOption.IGNORE_CASE
)
@@ -118,10 +119,18 @@ object AddressTextParser {
// Extract website (but not email)
if (!line.contains("@")) {
- WEBSITE_REGEX.find(line)?.let {
- if (result.website == null && !it.value.contains("@")) {
- result.website = it.value
- processed = true
+ WEBSITE_REGEX.find(line)?.let { match ->
+ if (result.website == null && !match.value.contains("@")) {
+ // Additional check: If 2+ words follow the match, it's likely a name with title (e.g., "Dipl.Phys Max Mustermann")
+ val remainingText = line.substring(match.range.last + 1).trim()
+ val wordsAfter = remainingText.split(Regex("""\s+""")).filter { it.isNotBlank() }
+
+ if (wordsAfter.size < 2) {
+ // Likely a real website
+ result.website = match.value
+ processed = true
+ }
+ // If 2+ words follow, skip this match (likely a title + name)
}
}
}
@@ -285,6 +294,11 @@ object AddressTextParser {
result.title = parsedName.titles.joinToString(" ")
}
+ // Set form of address (maps to AddressDO.form)
+ if (parsedName.formOfAddress != null) {
+ result.form = parsedName.formOfAddress
+ }
+
// Set first name and last name
if (parsedName.firstName.isNotEmpty()) {
result.firstName = parsedName.firstName
48e37a4c9
AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
48e37a4c92a17e99e7d8d7440d4d35a1fcabf3ce
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index 307a08040..9b55d3d6a 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -277,8 +277,8 @@ object AddressTextParser {
remainingLine = remainingLine.substring(iAMatch.value.length).trim()
}
- // Use NameParser to extract titles, form of address, first name and last name
- val parsedName = NameParser.parse(remainingLine)
+ // Use PersonNameParser to extract titles, form of address, first name and last name
+ val parsedName = PersonNameParser.parse(remainingLine)
// Set title (join all titles with space)
if (parsedName.titles.isNotEmpty()) {
0b1ab35a7
AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
0b1ab35a73e093d9af27668996430ed46819a4ba
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index fa1e7e4ab..307a08040 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -32,21 +32,6 @@ private val log = KotlinLogging.logger {}
*/
object AddressTextParser {
- // Common German and English academic/professional titles
- private val TITLE_PATTERNS = listOf(
- "Dr\\.",
- "Prof\\.",
- "Dipl\\.-Kfm\\.",
- "Dipl\\.-Ing\\.",
- "Dipl\\.-Inf\\.",
- "Dipl\\.",
- "B\\.Sc\\.",
- "M\\.Sc\\.",
- "B\\.A\\.",
- "M\\.A\\.",
- "Ph\\.D\\.",
- "MBA"
- )
// Company suffixes
private val COMPANY_SUFFIXES = listOf(
@@ -292,27 +277,20 @@ object AddressTextParser {
remainingLine = remainingLine.substring(iAMatch.value.length).trim()
}
- // Extract title if present
- for (titlePattern in TITLE_PATTERNS) {
- val titleRegex = Regex("""^($titlePattern)\s*""")
- val match = titleRegex.find(remainingLine)
- if (match != null) {
- result.title = match.groupValues[1]
- remainingLine = remainingLine.substring(match.value.length).trim()
- break
- }
+ // Use NameParser to extract titles, form of address, first name and last name
+ val parsedName = NameParser.parse(remainingLine)
+
+ // Set title (join all titles with space)
+ if (parsedName.titles.isNotEmpty()) {
+ result.title = parsedName.titles.joinToString(" ")
}
- // Split remaining into first name and last name
- val nameParts = remainingLine.split(Regex("""\s+"""))
- when {
- nameParts.size >= 2 -> {
- result.firstName = nameParts[0]
- result.name = nameParts.drop(1).joinToString(" ")
- }
- nameParts.size == 1 -> {
- result.name = nameParts[0]
- }
+ // Set first name and last name
+ if (parsedName.firstName.isNotEmpty()) {