EN · DE · RU · FR · ES

#830: AddressTextParser.kt

projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt Kotlin class, projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt 369 lines · 224 code · 91 comments · 54 blank
Purpose: AddressTextParser.kt (projectforge/business/address/) is part of the ProjectForge open-source project management application. It defines the `AddressTextParser` object, a parser that extracts address information from free text such as email signatures.

Source (first 100 lines)

/////////////////////////////////////////////////////////////////////////////
//
// Project ProjectForge Community Edition
//         www.projectforge.org
//
// Copyright (C) 2001-2026 Micromata GmbH, Germany (www.micromata.com)
//
// ProjectForge is dual-licensed.
//
// This community edition is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License as published
// by the Free Software Foundation; version 3 of the License.
//
// This community edition is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
// Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, see http://www.gnu.org/licenses/.
//
/////////////////////////////////////////////////////////////////////////////

package org.projectforge.business.address

import mu.KotlinLogging

private val log = KotlinLogging.logger {}

/**
 * Parser for extracting address information from free text (e.g., email signatures).
 */
object AddressTextParser {


    // Legal-form suffixes that mark a company name (e.g. "GmbH", "Ltd.").
    // Dots are backslash-escaped, presumably so the entries can be embedded in a
    // regular expression (the usage is outside this excerpt).
    private val COMPANY_SUFFIXES: List<String> = listOf(
        "GmbH", "AG", """e\.V\.""", "KG", "OHG", "UG",
        "SE", """Ltd\.""", """Inc\.""", """Corp\.""", "LLC", "PLC",
    )

    // Email address: local part, "@", domain, and a final label of at least two letters.
    private val EMAIL_REGEX: Regex =
        """[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}""".toRegex(RegexOption.IGNORE_CASE)

    // Website regex restricted to a list of known TLDs.
    // Longer TLDs are listed before shorter ones that are a prefix of them (e.g. "group" before "gr").
    // The negative lookahead after the TLD requires the TLD to end there: without it, a title such as
    // "Dipl.Phys" is matched as "Dipl.Ph" ("ph" is a listed TLD) and unknown TLDs are truncated to a
    // wrong host (e.g. "example.technology" -> "example.tech").
    private val WEBSITE_REGEX = Regex(
        """(?:https?://)?(?:www\.)?[a-zA-Z0-9.-]+\.(?:solutions|company|academy|digital|center|online|store|group|cloud|gmbh|tech|info|shop|app|dev|pro|com|org|net|edu|gov|biz|aero|asia|coop|jobs|mobi|museum|name|post|tel|travel|xxx|de|uk|us|io|co|eu|ch|at|fr|it|es|nl|be|pl|ru|jp|cn|au|ca|nz|se|no|dk|fi|in|br|mx|za|kr|tw|hk|sg|my|th|vn|ph|id|ae|sa|il|tr|gr|cz|sk|hu|ro|bg|hr|si|lt|lv|ee|is|ie|pt|lu|mt|cy|ai)(?![a-zA-Z0-9])(?:/[^\s]*)?""",
        RegexOption.IGNORE_CASE
    )

    // Labelled phone number: a label (Tel, Telefon, Phone, Fon, Mobil, Mobile or Fax, with an
    // optional colon; "Tel" may also carry a dot) followed by the number.
    // Group 1 is the number itself: optional "+", then digits separated by any of
    // whitespace, "-", ".", "/", "(" or ")", ending on a digit.
    private val PHONE_REGEX: Regex =
        """(?:Tel\.?:?|Telefon:?|Phone:?|Fon:?|Mobil:?|Mobile:?|Fax:?)\s*(\+?(?:\d+[\s\-./()]*)+\d)"""
            .toRegex(RegexOption.IGNORE_CASE)

    // Bare phone regex (phone number without prefix label).
    // Matches a whole line consisting of an optional "+" followed by at least 8 digits, where every
    // digit may be followed by separators (whitespace, "-", ".", "/", "(" or ")").
    // The 8-digit minimum avoids false positives with regular numbers.
    // Each repetition consumes exactly ONE digit: the former `\d+` inside `{8,}` accepted the same
    // strings but let the engine split a digit run in exponentially many ways, so a long digit run
    // followed by a non-matching character caused catastrophic backtracking.
    private val BARE_PHONE_REGEX = Regex(
        """^\+?(?:\d[\s\-./()]*){8,}$"""
    )

    // Postal code followed by a city name, with an optional "D-" or "CH-" country prefix.
    // Group 1: the 4- or 5-digit postal code (e.g. 5 digits for Germany, 4 for Switzerland).
    // Group 2: the city name, a capitalised word optionally followed by further words
    //          joined by whitespace or "-".
    private val ZIP_CITY_REGEX: Regex =
        """(?:D-|CH-)?(\d{4,5})\s+([A-ZÄÖÜ][a-zäöüß]+(?:[\s-][A-ZÄÖÜ]?[a-zäöüß]+)*)""".toRegex()

    // Street line: a capitalised street name (further words joined by whitespace or "-",
    // optional trailing "."), then whitespace or "-", then a house number with an optional
    // letter suffix and an optional range ("12a", "1-5"). Group 1 spans the whole match.
    private val STREET_REGEX: Regex =
        """([A-ZÄÖÜ][a-zäöüß]+(?:[\s-][A-ZÄÖÜ]?[a-zäöüß]+)*\.?(?:\s+|-)(?:\d+[a-zA-Z]?(?:\s*-\s*\d+[a-zA-Z]?)?))"""
            .toRegex()

    // Whole-line country name, matched case-insensitively, in several languages.
    // A second country name may follow after "/" (e.g. "Schweiz / Switzerland").
    // The alternation of names is written once and used for both positions.
    private val COUNTRY_REGEX: Regex = run {
        val countryNames =
            """Deutschland|Germany|Schweiz|Switzerland|Österreich|Austria|France|Frankreich|Italia?|Italy|UK|USA|United States|United Kingdom|Nederland|Netherlands|Belgique|Belgium|España|Spain|Portugal|Sverige|Sweden|Norge|Norway|Danmark|Denmark|Polska|Poland|Česko|Czech Republic|Slovensko|Slovakia|Magyarország|Hungary|România|Romania|Bulgarien|Bulgaria|Ellinikí Demokratía|Greece|Türkiye|Turkey|Россия|Russia"""
        """^($countryNames)(?:\s*/\s*(?:$countryNames))?$""".toRegex(RegexOption.IGNORE_CASE)
    }

    /**
     * Parses free text and extracts address information.
     */
    fun parseAddressText(text: String): ParsedAddressData {

Git History

868d6abb7 2025 -> 2026
30ec0db73 AddressImportReconciler
cab29b70b WIP: AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
48e37a4c9 AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
0b1ab35a7 AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.

868d6abb7

2025 -> 2026
868d6abb75cd191a892911ac8e45058932cf9074
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index 6c334b22b..a4d96916a 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -3,7 +3,7 @@
 // Project ProjectForge Community Edition
 //         www.projectforge.org
 //
-// Copyright (C) 2001-2025 Micromata GmbH, Germany (www.micromata.com)
+// Copyright (C) 2001-2026 Micromata GmbH, Germany (www.micromata.com)
 //
 // ProjectForge is dual-licensed.
 //

30ec0db73

AddressImportReconciler
30ec0db73e57c418559d0d754bfceb90c1997db2
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index c2f8e1068..6c334b22b 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -70,6 +70,13 @@ object AddressTextParser {
         RegexOption.IGNORE_CASE
     )
 
+    // Bare phone regex (phone number without prefix label)
+    // Matches phone numbers that start with + or country code and have at least 8 digits
+    // This avoids false positives with regular numbers
+    private val BARE_PHONE_REGEX = Regex(
+        """^\+?(?:\d+[\s\-./()]*){8,}$"""
+    )
+
     // ZIP + City (4-5 digits + city name, optionally with "D-" or "CH-" prefix)
     // Supports German (5 digits), Swiss (4 digits), and other formats
     private val ZIP_CITY_REGEX = Regex(
@@ -135,7 +142,7 @@ object AddressTextParser {
                 }
             }
 
-            // Extract phone numbers
+            // Extract phone numbers with prefix (Tel:, Phone:, etc.)
             if (line.matches(Regex(""".*(?:Tel\.?:?|Telefon:?|Phone:?|Fon:?|Mobil:?|Mobile:?|Fax:?).*""", RegexOption.IGNORE_CASE))) {
                 val phoneMatch = PHONE_REGEX.find(line)
                 if (phoneMatch != null) {
@@ -161,6 +168,25 @@ object AddressTextParser {
                 }
             }
 
+            // Extract bare phone numbers (without prefix)
+            if (!processed && BARE_PHONE_REGEX.matches(line)) {
+                val phone = line.trim()
+                phoneNumbers.add(phone)
+
+                // Normalize phone number
+                val normalizedPhone = org.projectforge.framework.utils.PhoneNumberUtils.normalizePhoneNumber(phone)
+
+                // Without label, assume it's a business phone (unless we already have one, then mobile)
+                if (result.businessPhone == null) {
+                    result.businessPhone = normalizedPhone
+                } else if (result.mobilePhone == null) {
+                    result.mobilePhone = normalizedPhone
+                } else if (result.fax == null) {
+                    result.fax = normalizedPhone
+                }
+                processed = true
+            }
+
             // Extract ZIP + City (might be combined with street in same line)
             ZIP_CITY_REGEX.find(line)?.let {
                 if (result.zipCode == null) {

cab29b70b

WIP: AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
cab29b70bfb32b34408aae8db933f60ec04bc405
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index 9b55d3d6a..c2f8e1068 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -55,9 +55,10 @@ object AddressTextParser {
         RegexOption.IGNORE_CASE
     )
 
-    // Website regex
+    // Website regex with known TLDs to avoid false positives like "Dipl.Phys"
+    // Note: Longer TLDs are listed before shorter ones to ensure correct matching (e.g., "group" before "gr")
     private val WEBSITE_REGEX = Regex(
-        """(?:https?://)?(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?""",
+        """(?:https?://)?(?:www\.)?[a-zA-Z0-9.-]+\.(?:solutions|company|academy|digital|center|online|store|group|cloud|gmbh|tech|info|shop|app|dev|pro|com|org|net|edu|gov|biz|aero|asia|coop|jobs|mobi|museum|name|post|tel|travel|xxx|de|uk|us|io|co|eu|ch|at|fr|it|es|nl|be|pl|ru|jp|cn|au|ca|nz|se|no|dk|fi|in|br|mx|za|kr|tw|hk|sg|my|th|vn|ph|id|ae|sa|il|tr|gr|cz|sk|hu|ro|bg|hr|si|lt|lv|ee|is|ie|pt|lu|mt|cy|ai)(?:/[^\s]*)?""",
         RegexOption.IGNORE_CASE
     )
 
@@ -118,10 +119,18 @@ object AddressTextParser {
 
             // Extract website (but not email)
             if (!line.contains("@")) {
-                WEBSITE_REGEX.find(line)?.let {
-                    if (result.website == null && !it.value.contains("@")) {
-                        result.website = it.value
-                        processed = true
+                WEBSITE_REGEX.find(line)?.let { match ->
+                    if (result.website == null && !match.value.contains("@")) {
+                        // Additional check: If 2+ words follow the match, it's likely a name with title (e.g., "Dipl.Phys Max Mustermann")
+                        val remainingText = line.substring(match.range.last + 1).trim()
+                        val wordsAfter = remainingText.split(Regex("""\s+""")).filter { it.isNotBlank() }
+
+                        if (wordsAfter.size < 2) {
+                            // Likely a real website
+                            result.website = match.value
+                            processed = true
+                        }
+                        // If 2+ words follow, skip this match (likely a title + name)
                     }
                 }
             }
@@ -285,6 +294,11 @@ object AddressTextParser {
             result.title = parsedName.titles.joinToString(" ")
         }
 
+        // Set form of address (maps to AddressDO.form)
+        if (parsedName.formOfAddress != null) {
+            result.form = parsedName.formOfAddress
+        }
+
         // Set first name and last name
         if (parsedName.firstName.isNotEmpty()) {
             result.firstName = parsedName.firstName

48e37a4c9

AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
48e37a4c92a17e99e7d8d7440d4d35a1fcabf3ce
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index 307a08040..9b55d3d6a 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -277,8 +277,8 @@ object AddressTextParser {
             remainingLine = remainingLine.substring(iAMatch.value.length).trim()
         }
 
-        // Use NameParser to extract titles, form of address, first name and last name
-        val parsedName = NameParser.parse(remainingLine)
+        // Use PersonNameParser to extract titles, form of address, first name and last name
+        val parsedName = PersonNameParser.parse(remainingLine)
 
         // Set title (join all titles with space)
         if (parsedName.titles.isNotEmpty()) {

0b1ab35a7

AddressTextParser: PersonNameParser introduced for improved parsing of titles, form-of-address etc.
0b1ab35a73e093d9af27668996430ed46819a4ba
diff --git a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
index fa1e7e4ab..307a08040 100644
--- a/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
+++ b/projectforge-business/src/main/kotlin/org/projectforge/business/address/AddressTextParser.kt
@@ -32,21 +32,6 @@ private val log = KotlinLogging.logger {}
  */
 object AddressTextParser {
 
-    // Common German and English academic/professional titles
-    private val TITLE_PATTERNS = listOf(
-        "Dr\\.",
-        "Prof\\.",
-        "Dipl\\.-Kfm\\.",
-        "Dipl\\.-Ing\\.",
-        "Dipl\\.-Inf\\.",
-        "Dipl\\.",
-        "B\\.Sc\\.",
-        "M\\.Sc\\.",
-        "B\\.A\\.",
-        "M\\.A\\.",
-        "Ph\\.D\\.",
-        "MBA"
-    )
 
     // Company suffixes
     private val COMPANY_SUFFIXES = listOf(
@@ -292,27 +277,20 @@ object AddressTextParser {
             remainingLine = remainingLine.substring(iAMatch.value.length).trim()
         }
 
-        // Extract title if present
-        for (titlePattern in TITLE_PATTERNS) {
-            val titleRegex = Regex("""^($titlePattern)\s*""")
-            val match = titleRegex.find(remainingLine)
-            if (match != null) {
-                result.title = match.groupValues[1]
-                remainingLine = remainingLine.substring(match.value.length).trim()
-                break
-            }
+        // Use NameParser to extract titles, form of address, first name and last name
+        val parsedName = NameParser.parse(remainingLine)
+
+        // Set title (join all titles with space)
+        if (parsedName.titles.isNotEmpty()) {
+            result.title = parsedName.titles.joinToString(" ")
         }
 
-        // Split remaining into first name and last name
-        val nameParts = remainingLine.split(Regex("""\s+"""))
-        when {
-            nameParts.size >= 2 -> {
-                result.firstName = nameParts[0]
-                result.name = nameParts.drop(1).joinToString(" ")
-            }
-            nameParts.size == 1 -> {
-                result.name = nameParts[0]
-            }
+        // Set first name and last name
+        if (parsedName.firstName.isNotEmpty()) {