
corrected error which arose in the logic of the wget backup download

master
alpcentaur 9 months ago
commit 5d17f4e421
10 changed files with 11 additions and 17439 deletions:

  1. BIN         spiders/__pycache__/fdb_spider.cpython-39.pyc
  2. +11  -8     spiders/fdb_spider.py
  3. +0   -2351  spiders/pages/foerderinfo.bund.de1entryList.html
  4. +0   -2271  spiders/pages/foerderinfo.bund.de2entryList.html
  5. +0   -2179  spiders/pages/foerderinfo.bund.de3entryList.html
  6. +0   -2186  spiders/pages/foerderinfo.bund.de4entryList.html
  7. +0   -2185  spiders/pages/foerderinfo.bund.de5entryList.html
  8. +0   -2178  spiders/pages/foerderinfo.bund.de6entryList.html
  9. +0   -2173  spiders/pages/foerderinfo.bund.de7entryList.html
  10. +0  -1908  spiders/pages/foerderinfo.bund.de8entryList.html

BIN  spiders/__pycache__/fdb_spider.cpython-39.pyc

+11  -8  spiders/fdb_spider.py

@@ -510,6 +510,7 @@ class fdb_spider(object):
            driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)

        for entry_id in dictionary_entry_list:
            print(entry_id)
            entry_link = dictionary_entry_list[entry_id]["link"]
            web_content = 'NONE'
            # download the html page of the entry
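A minimal standalone sketch of the paginated list fetch the context lines above belong to (the driver setup and both link fragments are assumptions for illustration, not values from the repository):

    from selenium import webdriver

    # Hypothetical values; the real fragments come from the spider's config.
    fdb_conf_entry_list_link1 = "https://example.com/list?page="
    fdb_conf_entry_list_link2 = "&lang=de"

    driver = webdriver.Firefox()
    for i in range(1, 9):
        # build each paginated URL the same way the hunk above does
        driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
        html = driver.page_source
    driver.quit()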
@@ -556,14 +557,14 @@ class fdb_spider(object):
            if 'javascript' not in entry_link and '.pdf' not in entry_link:
                print('blabuuuuuba')
                #print('oi')
                try:
                    # set a cookie so we do not end up in an endless loop caused by cookie banners pointing to redirects
                    url = entry_link
                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'})
                    response = urllib.request.urlopen(req)
                    #print('response from first one', response)
                    print('response from first one', response)
                except Exception as e:
                    print('cookie giving then downloading did not work, original error is:', e)
                    try:
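For context on the hunk above, a minimal sketch of the cookie-header workaround it uses: send a User-Agent plus a dummy Cookie header so that cookie banners which redirect until a cookie is present do not produce an endless redirect loop (the URL is a placeholder, not from the repository):

    import urllib.request

    # Placeholder URL; the spider uses each entry's entry_link here.
    url = "https://example.com/entry"
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'},
    )
    with urllib.request.urlopen(req) as response:
        web_content = response.read().decode('utf-8', errors='replace')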
@@ -604,7 +605,7 @@ class fdb_spider(object):
                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                wget_wrote = False
                if web_content == 'NONE':
                    print('other downloading approaches did not work, trying requests')
@@ -623,14 +624,16 @@ class fdb_spider(object):
                    try:
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
                        wget_wrote = True
                    except subprocess.CalledProcessError:
                        print('wget downloading did not work.. saving NONE to file now')
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "w+")
                        f.write(web_content)
                        f.close()
                    if wget_wrote == False:
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "w+")
                        f.write(web_content)
                        f.close()

                # save the entry_domain, implemented first for further downloads in javascript links
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
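The commit message refers to the wget backup logic fixed in the hunk above: web_content should only be written to the file when wget did not already write it. A minimal sketch of that corrected pattern (the helper name save_entry_page is hypothetical; note that subprocess.run only raises CalledProcessError when called with check=True, so the sketch passes it explicitly):

    import os
    import subprocess

    def save_entry_page(entry_link, file_name, web_content='NONE'):
        # Hypothetical helper illustrating the corrected backup logic:
        # try wget first, and fall back to writing the in-memory content
        # only if wget did not already write the file.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        wget_wrote = False
        try:
            # check=True raises CalledProcessError on a non-zero exit status
            subprocess.run(["wget", entry_link, "--output-document=" + file_name], check=True)
            wget_wrote = True
        except (subprocess.CalledProcessError, FileNotFoundError):
            print('wget downloading did not work.. saving NONE to file now')
        if not wget_wrote:
            with open(file_name, "w+") as f:
                f.write(web_content)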

+0  -2351  spiders/pages/foerderinfo.bund.de1entryList.html
File diff suppressed because it is too large

+0  -2271  spiders/pages/foerderinfo.bund.de2entryList.html
File diff suppressed because it is too large

+0  -2179  spiders/pages/foerderinfo.bund.de3entryList.html
File diff suppressed because it is too large

+0  -2186  spiders/pages/foerderinfo.bund.de4entryList.html
File diff suppressed because it is too large

+0  -2185  spiders/pages/foerderinfo.bund.de5entryList.html
File diff suppressed because it is too large

+0  -2178  spiders/pages/foerderinfo.bund.de6entryList.html
File diff suppressed because it is too large

+0  -2173  spiders/pages/foerderinfo.bund.de7entryList.html
File diff suppressed because it is too large

+0  -1908  spiders/pages/foerderinfo.bund.de8entryList.html
File diff suppressed because it is too large
