1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
|
<?php
// Name Specific Functions
//
// webtrees: Web based Family History software
// Copyright (C) 2011 webtrees development team.
//
// Derived from PhpGedView
// Copyright (C) 2002 to 2009 PGV Development Team. All rights reserved.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// $Id$
// Refuse to run unless the WT_WEBTREES constant has already been defined:
// answer with a 403 status line and stop the script.
if (defined('WT_WEBTREES') === false) {
	header('HTTP/1.0 403 Forbidden');
	exit;
}
/**
 * Takes a string and converts certain characters in the string to others for the purpose of soundex searches.
 *
 * The two-letter spellings AE/ae, OE/oe and UE/ue are replaced by Ä/ä, Ö/ö and Ü/ü,
 * and both "ss" and "SS" are replaced by ß. The replacements are applied in that order.
 *
 * The argument is received by value and is never modified: the converted text is
 * only available through the return value.
 *
 * @param string $input the text to convert
 * @return string the converted text
 */
function Character_Substitute($input)
{
	$stringsToReplace = array("/AE/", "/ae/", "/OE/", "/oe/", "/UE/", "/ue/", "/ss/", "/SS/");
	$replacements = array("Ä", "ä", "Ö", "ö", "Ü", "ü", "ß", "ß");
	// Hand the converted text back to the caller; without this return the
	// substitution result is thrown away and the function does nothing.
	return preg_replace($stringsToReplace, $replacements, $input);
}
/**
 * Get array of common surnames
 *
 * Starts from the surnames that get_top_surnames() reports for the given
 * threshold, adds every name listed in the COMMON_NAMES_ADD setting that is
 * not already present, and removes every name listed in the
 * COMMON_NAMES_REMOVE setting (looked up in upper case). If nothing is left
 * and the threshold is still above 2, the search is repeated with half the
 * threshold.
 *
 * @param int $min the number of times a surname must occur before it is added to the array
 * @return array surname => array('name'=>surname, 'match'=>value), ordered by surname with utf8_strcasecmp;
 *               'match' is the value from get_top_surnames(), or $min for names added from the setting
 */
function get_common_surnames($min) {
	$names_to_add    = explode(',', get_gedcom_setting(WT_GED_ID, 'COMMON_NAMES_ADD'));
	$names_to_remove = explode(',', get_gedcom_setting(WT_GED_ID, 'COMMON_NAMES_REMOVE'));
	$surnames        = get_top_surnames(WT_GED_ID, $min, 0);

	// Forced additions: skip blanks and names that are already listed
	foreach ($names_to_add as $extra) {
		if (!$extra || array_key_exists($extra, $surnames)) {
			continue;
		}
		$surnames[$extra] = $min;
	}

	// Forced removals
	foreach ($names_to_remove as $unwanted) {
		unset($surnames[utf8_strtoupper($unwanted)]);
	}

	//-- nothing found: try again with half the threshold
	if (empty($surnames) && $min > 2) {
		return get_common_surnames($min / 2);
	}

	uksort($surnames, 'utf8_strcasecmp');

	$common = array();
	foreach ($surnames as $surname => $count) {
		$common[$surname] = array('name' => $surname, 'match' => $count);
	}
	return $common;
}
/**
 * strip name prefixes
 *
 * Removes three things from a surname:
 *  - a " jr"/" sr" suffix (any letter case, optional trailing dot) that is followed by a comma,
 *  - a run of capital I's (" I", " II", " III", ...) that is followed by a comma,
 *  - leading lowercase prefixes of one to four letters such as "van", "van der" or "de";
 *    each prefix must be followed by one of . space _ - ( [ so that shortened
 *    and multiple prefixes are removed as well.
 * In the first two cases the comma itself is kept.
 *
 * @param string $lastname The name to strip
 * @return string The updated name, or the original name when stripping would leave nothing
 */
function strip_prefix($lastname) {
	// pattern => replacement, applied in this order
	$rules = array(
		'/ [jJsS][rR]\.?,/'             => ',',
		'/ I+,/'                        => ',',
		'/^([a-z]{1,4}[\. \_\-\(\[])+/' => '',
	);
	$stripped = trim(preg_replace(array_keys($rules), array_values($rules), $lastname));
	return $stripped == '' ? $lastname : $stripped;
}
/**
 * This function replaces @N.N. and @P.N. with the language specific translations
 *
 * A string is cleaned of its GEDCOM slashes, has both placeholders replaced and
 * gets its quoted (only when $UNDERLINE_NAME_QUOTES is set) and starred parts
 * wrapped in a "starredname" span. An array of name parts is joined into one
 * string with ", " and " + " separators.
 *
 * @param mixed $names $names could be an array of name parts or it could be a string of the name
 * @return string the display name; for an array that produces nothing, the translation of @N.N.
 */
function check_NN($names) {
	global $UNDERLINE_NAME_QUOTES;
	global $UNKNOWN_NN, $UNKNOWN_PN;

	$starred = '<span class="starredname">$1</span>';

	//-- a single string
	if (!is_array($names)) {
		$text = str_replace(array(' /','/,','/'), array(' ', ',', ' '), $names);
		$text = str_replace(array('@N.N.','@P.N.'), array($UNKNOWN_NN,$UNKNOWN_PN), trim($text));
		//-- underline names in quotes, when enabled
		if ($UNDERLINE_NAME_QUOTES) {
			$text = preg_replace('/"(.+)"/', $starred, $text);
		}
		//-- underline names with a * at the end
		//-- see this forum thread http://sourceforge.net/forum/forum.php?thread_id=1223099&forum_id=185165
		return preg_replace('/([^ ]+)\*/', $starred, $text);
	}

	//-- an array of name parts
	$total = count($names);
	if ($total == 2 && stristr($names[0], '@N.N.') && stristr($names[1], '@N.N.')) {
		$joined = $UNKNOWN_NN.' + '.$UNKNOWN_NN;
	} else {
		$joined = '';
		for ($i = 0; $i < $total; ++$i) {
			$part = $names[$i];
			$is_unknown = (bool) stristr($part, '@N.N.');
			if ($is_unknown) {
				$part = str_replace('@N.N.', $UNKNOWN_NN, trim($part));
			}
			if (stristr($part, '@P.N.')) {
				$part = $UNKNOWN_PN;
			}
			// Only the third part can be introduced by " + ": always when there
			// are more than three parts, and with exactly three parts when it
			// was unknown or contains "Individual ". Everything else gets ", ".
			$use_plus = $i == 2
				&& ($total > 3 || ($total == 3 && ($is_unknown || stristr($part, 'Individual '))));
			$joined .= ($use_plus ? ' + ' : ', ') . trim($part);
		}
	}

	// Drop a trailing comma and the separator that was put before the first part
	$joined = trim($joined);
	if (substr($joined, -1) == ',') {
		$joined = substr($joined, 0, strlen($joined) - 1);
	}
	if (substr($joined, 0, 2) == ', ') {
		$joined = substr($joined, 2);
	}
	$joined = trim($joined);

	return empty($joined) ? $UNKNOWN_NN : $joined;
}
/**
* Determine the Daitch-Mokotoff Soundex codes for a word.
*
* The name is upper-cased, rewritten with the rules in $transformNameTable and then
* scanned from left to right. At each position the longest chunk (at most $maxchar
* bytes) that is a key of $dmsounds is looked up; bytes that start no known chunk
* are skipped. A chunk may carry several alternative codes, so one name can yield
* several results.
*
* As used below, each $dmsounds entry holds a vowel flag at index 0 (anything other
* than '0' marks a vowel) followed by groups of three codes, one per state
* (1: start of name, 2: before a vowel, 3: other). Groups after the first are
* alternative codings.
*
* @param string $name The name
* @return array The unique 6-character codes, zero-padded on the right and sorted
* @author G. Kroll (canajun2eh), after a previous implementation by Boudewijn Sjouke
*/
function DMSoundex($name) {
global $transformNameTable, $dmsounds, $maxchar;
// If the code tables are not loaded, reload! Keep them global!
// NOTE(review): presumably the included file defines WT_DMSOUNDS_UTF8_PHP and fills the three globals above - confirm
if (!defined('WT_DMSOUNDS_UTF8_PHP')) {
require WT_ROOT.'includes/dmsounds_UTF8.php';
}
// Apply special transformation rules to the input string
// Each rule is a (search, replace) pair handed to str_replace()
$name = utf8_strtoupper($name);
foreach ($transformNameTable as $transformRule) {
$name = str_replace($transformRule[0], $transformRule[1], $name);
}
// Initialize
$name_script = utf8_script($name);
// Hebrew and Arabic names get special treatment of repeated sounds (see below)
if ($name_script == 'hebrew' || $name_script == 'arabic') $noVowels = true;
else $noVowels = false;
// Positions and lengths in this function are byte offsets (strlen/substr)
$lastPos = strlen($name) - 1;
$currPos = 0;
$state = 1; // 1: start of input string, 2: before vowel, 3: other
$result = array(); // accumulate complete 6-digit D-M codes here
$partialResult = array(); // accumulate incomplete D-M codes here
$partialResult[] = array('!'); // initialize 1st partial result ('!' stops "duplicate sound" check)
// Loop through the input string.
// Stop when the string is exhausted or when no more partial results remain
while (count($partialResult) !=0 && $currPos <= $lastPos) {
// Find the DM coding table entry for the chunk at the current position
$thisEntry = substr($name, $currPos, $maxchar); // Get maximum length chunk
while ($thisEntry != '') {
if (isset($dmsounds[$thisEntry])) break;
$thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
}
if ($thisEntry == '') {
$currPos ++; // Not in table: advance pointer to next byte
continue; // and try again
}
$soundTableEntry = $dmsounds[$thisEntry];
// Every partial result collected so far is extended with the codes of this chunk
$workingResult = $partialResult;
$partialResult = array();
$currPos += strlen($thisEntry);
// $state stays 1 only for the first chunk that is found. The loop further down
// leaves it at 4 or more, so from the second chunk on it is recomputed here.
if ($state != 1) { // Not at beginning of input string
if ($currPos <= $lastPos) {
// Determine whether the next chunk is a vowel
$nextEntry = substr($name, $currPos, $maxchar); // Get maximum length chunk
while ($nextEntry != '') {
if (isset($dmsounds[$nextEntry])) break;
$nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
}
} else $nextEntry = '';
if ($nextEntry != '' && $dmsounds[$nextEntry][0] != '0') $state = 2; // Next chunk is a vowel
else $state = 3;
}
// Visit the code for the current state in every triplet of the table entry
while ($state < count($soundTableEntry)) {
if ($soundTableEntry[$state] == '') { // empty means 'ignore this sound in this state'
foreach ($workingResult as $workingEntry) {
$tempEntry = $workingEntry;
$tempEntry[count($tempEntry)-1] .= '!'; // Prevent false 'doubles'
$partialResult[] = $tempEntry;
}
} else {
foreach ($workingResult as $workingEntry) {
if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry)-1]) {
// Incoming sound isn't a duplicate of the previous sound
$workingEntry[] = $soundTableEntry[$state];
} else {
// Incoming sound is a duplicate of the previous sound
// For Hebrew and Arabic, we need to create a pair of D-M sound codes,
// one of the pair with only a single occurrence of the duplicate sound,
// the other with both occurrences
// NOTE(review): the line that would keep the single-occurrence variant is
// commented out, so only the variant with both occurrences is produced here.
// For all other scripts the duplicate sound is dropped.
if ($noVowels) {
//$partialResult[] = $workingEntry;
$workingEntry[] = $soundTableEntry[$state];
}
}
if (count($workingEntry) < 7) $partialResult[] = $workingEntry;
else {
// This is the 6th code in the sequence
// We're looking for 7 entries because the first is '!' and doesn't count
// Strip the '!' markers, then cut (or zero-pad) to exactly 6 characters
$tempResult = str_replace('!', '', implode('', $workingEntry)) . '000000';
$result[] = substr($tempResult, 0, 6);
}
}
}
$state = $state + 3; // Advance to next triplet while keeping the same basic state
}
}
// Zero-fill and copy all remaining partial results
foreach ($partialResult as $workingEntry) {
$tempResult = str_replace('!', '', implode('', $workingEntry)) . '000000';
$result[] = substr($tempResult, 0, 6);
}
$result = array_flip(array_flip($result)); // Kill the double results in the array
// We're done. All that's left is to sort the result
sort($result);
return $result;
}
/**
 * Wrapper function for soundex function. Return a colon separated list of values.
 *
 * The text is split on spaces and every non-empty word gets its own soundex()
 * code. When the text holds more than one word, the code of the whole text
 * with its spaces removed is appended. Duplicate codes are dropped and at most
 * 51 codes are returned.
 *
 * @param string $text one or more names separated by spaces
 * @return string the distinct soundex codes joined with ':'
 */
function soundex_std($text) {
	// NOTE(review): the return value is not used and the argument is passed by
	// value, so $text reaches soundex() unchanged.
	Character_Substitute($text);

	$soundex_array = array();
	$words = explode(' ', $text);
	foreach ($words as $word) {
		if ($word) {
			$soundex_array[] = soundex($word);
		}
	}
	if (count($words) > 1) {
		// str_replace() really removes the spaces; strtr() with an empty
		// replacement string returns its input untouched.
		$soundex_array[] = soundex(str_replace(' ', '', $text));
	}
	// A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters).
	// Duplicates are removed before truncating so that repeated codes cannot
	// push distinct ones past the limit.
	return implode(':', array_slice(array_unique($soundex_array), 0, 51));
}
/**
 * Wrapper function for the Daitch-Mokotoff soundex function. Return a colon separated list of values.
 *
 * The text is split on spaces and the DMSoundex() codes of every non-empty
 * word are collected. When the text holds more than one word, the codes of the
 * whole text with its spaces removed are appended. Duplicate codes are dropped
 * and at most 36 codes are returned.
 *
 * @param string $text one or more names separated by spaces
 * @return string the distinct D-M codes joined with ':'
 */
function soundex_dm($text) {
	// NOTE(review): the return value is not used and the argument is passed by
	// value, so $text reaches DMSoundex() unchanged.
	Character_Substitute($text);

	$soundex_array = array();
	$words = explode(' ', $text);
	foreach ($words as $word) {
		if ($word) {
			$soundex_array = array_merge($soundex_array, DMSoundex($word));
		}
	}
	if (count($words) > 1) {
		// str_replace() really removes the spaces; strtr() with an empty
		// replacement string returns its input untouched.
		$soundex_array = array_merge($soundex_array, DMSoundex(str_replace(' ', '', $text)));
	}
	// A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters).
	// Duplicates are removed before truncating so that repeated codes cannot
	// push distinct ones past the limit.
	return implode(':', array_slice(array_unique($soundex_array), 0, 36));
}
|