Skip to content

Commit

Permalink
mundia.lua: change to regex's
Browse files Browse the repository at this point in the history
  • Loading branch information
Arkiver2 committed Aug 30, 2014
1 parent 3f789a7 commit ccfeece
Showing 1 changed file with 20 additions and 10 deletions.
30 changes: 20 additions & 10 deletions mundia.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ JSON = (loadfile "JSON.lua")()
local url_count = 0
local tries = 0

--- Read a file from disk and decode its contents as JSON.
-- @param file path of the file to read; may be nil
-- @return the value produced by JSON:decode on the file's contents, or nil
--   when `file` is nil/false or the file cannot be opened
-- @return the error message from io.open when the file cannot be opened
load_json_file = function(file)
  if not file then
    return nil
  end

  -- io.open returns nil plus a message on failure; indexing that nil would
  -- raise "attempt to index a nil value", so report the failure instead.
  local f, err = io.open(file)
  if not f then
    return nil, err
  end

  local data = f:read("*all")
  f:close()
  return JSON:decode(data)
end

read_file = function(file)
if file then
Expand Down Expand Up @@ -35,14 +45,14 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
local mundia_url = "http://www.mundia.com"

--example url: http://www.mundia.com/us/surnames/aleo
if string.match(url, "%.mundia%.com/[a-z]+/surnames/[a-z0-9%]+") then
if string.match(url, "%.mundia%.com/[^/]+/surnames/[^/]+") then
if not html then
html = read_fie(file)
end

local surname_lower = string.match(url, "%.mundia%.com/[a-z]+/surnames/([a-z0-9%]+)")
local surname_lower = string.match(url, "%.mundia%.com/[^/]+/surnames/([^/]+)")
local surname_upper = string.upper(surname_lower)
local country_code = string.match(url, "%.mundia%.com/([a-z]+)/surnames/[a-z0-9%]+")
local country_code = string.match(url, "%.mundia%.com/([^/]+)/surnames/[^/]+")
--chfoo - is it alright if I add all these urls for all countries?
table.insert(urls, { url="http://www.mundia.com/"..country_code.."/Search/Results?surname="..surname_upper.."&birthPlace=Afghanistan" })
table.insert(urls, { url="http://www.mundia.com/"..country_code.."/Search/Results?surname="..surname_upper.."&birthPlace=Albania" })
Expand Down Expand Up @@ -164,42 +174,42 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
end

--example url: http://www.mundia.com/pk/Search/Results?surname=ABDULA&birthPlace=Verenigde%20Staten
if string.match(url, "%.mundia%.com/[a-z]+/Search/Results?surname=[A-Z]+%&birthPlace=[.]+") then
if string.match(url, "%.mundia%.com/[^/]+/Search/Results?surname=[^/&]+%&birthPlace=[^<>/&]+") then
if not html then
html = read_file(file)
end

--example string: <a href="/pk/Person/5586782/-1432906874" class="">Joseph Sadula Abdula</a>
for person_url in string.gmatch(html, '<a href="(/[a-z]+/Person/[0-9]/[-]?[0-9]+)" class="">[a-z]+</a>') do
for person_url in string.gmatch(html, '<a href="(/[^/]+/Person/[^/]/[^/"& ]+)" class="">[^<>/]+</a>') do
--------------Multiple links possible as results probably - chfoo - help?-------------------
table.insert(urls, { url=mundia_url..person_url })
end

--example string: <a class="tree" href="/pk/Tree/Family/5586782/-1432906874"><span class="view-tree">Stamboom tonen</span></a>
for tree_url in string.gmatch(html, '<a class="[a-z]+" href="(/[a-z]+/Tree/Family/[0-9]/[-]?[0-9]+)"><span class="view-tree">[a-z]+</span></a>') do
for tree_url in string.gmatch(html, '<a class="[^"/<>]+" href="(/[^/]+/Tree/Family/[^/]/[^<>/]+)"><span class="view-tree">[^<>/]+</span></a>') do
--------------Multiple links possible as results probably - chfoo - help?-------------------
table.insert(urls, { url=mundia_url..tree_url })
end

--example string: <img src="http://mediasvc.ancestry.com/v2/image/namespaces/1093/media/11f96e77-c39c-4ca4-b659-32f67aa8d129.jpg?client=TreeService&MaxSide=96" width="68" alt="Foto" /></a>
for person_image in string.gmatch(html, '<img src="(http://mediasvc%.ancestry%.com/v[0-9]+/image/namespaces/[0-9]+/media/[a-z0-9-]+%.jpg%?client=TreeService&MaxSide=[0-9]+)" width="[0-9]+" alt="Foto" /></a>') do
for person_image in string.gmatch(html, '<img src="(http://mediasvc%.ancestry%.com/v[^/]+/image/namespaces/[^/]+/media/[^/%.]+%.jpg%?client=TreeService&MaxSide=[^"]+)" width="[^"]+" alt="[^"]+" /></a>') do
--------------Multiple links possible as results probably - chfoo - help?-------------------
table.insert(urls, { url=person_image })
for person_image_big in string.gmatch(person_image, "(http://mediasvc%.ancestry%.com/v[0-9]+/image/namespaces/[0-9]+/media/[a-z0-9-]+%.jpg%?client=TreeService)&MaxSide=[0-9]+") do
for person_image_big in string.gmatch(person_image, "(http://mediasvc%.ancestry%.com/v[^/]+/image/namespaces/[^/]+/media/[^/%.]+%.jpg%?client=TreeService)&MaxSide=[.]+") do
table.insert(urls, { url=person_image_big })
end
end
end

--example url: http://www.mundia.com/us/Person/743375/6809259973
--example url: http://www.mundia.com/us/Person/12748608/-190814136
if string.match(url, "%.mundia%.com/[a-z]+/Person/[0-9]+/[-]?[0-9]+") then
if string.match(url, "%.mundia%.com/[^/]+/Person/[^/]+/[^<>/&]+") then
if not html then
html = read_file(file)
end

--example string: href="/pk/Messages?sendMessageTo=0120cac9-0003-0000-0000-000000000000&subject=Joseph%2BSadula%2BAbdula"
for adding_user in string.gmatch(html, 'href="(/[a-z]+/Messages%?sendMessageTo=[0-9-]+&subject=[a-z%]+)"') do
for adding_user in string.gmatch(html, 'href="(/[^/]+/Messages%?sendMessageTo=[^&]+&subject=[^"]+)"') do
end

end
Expand Down

0 comments on commit ccfeece

Please sign in to comment.