Browse Source

corrected error which arose in the logic of the wget backup get

master
alpcentaur 11 months ago
parent
commit
5d17f4e421
10 changed files with 11 additions and 17439 deletions
  1. BIN
      spiders/__pycache__/fdb_spider.cpython-39.pyc
  2. +11
    -8
      spiders/fdb_spider.py
  3. +0
    -2351
      spiders/pages/foerderinfo.bund.de1entryList.html
  4. +0
    -2271
      spiders/pages/foerderinfo.bund.de2entryList.html
  5. +0
    -2179
      spiders/pages/foerderinfo.bund.de3entryList.html
  6. +0
    -2186
      spiders/pages/foerderinfo.bund.de4entryList.html
  7. +0
    -2185
      spiders/pages/foerderinfo.bund.de5entryList.html
  8. +0
    -2178
      spiders/pages/foerderinfo.bund.de6entryList.html
  9. +0
    -2173
      spiders/pages/foerderinfo.bund.de7entryList.html
  10. +0
    -1908
      spiders/pages/foerderinfo.bund.de8entryList.html

BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc View File


+ 11
- 8
spiders/fdb_spider.py View File

@ -510,6 +510,7 @@ class fdb_spider(object):
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2) driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
for entry_id in dictionary_entry_list: for entry_id in dictionary_entry_list:
print(entry_id)
entry_link = dictionary_entry_list[entry_id]["link"] entry_link = dictionary_entry_list[entry_id]["link"]
web_content = 'NONE' web_content = 'NONE'
# download the html page of the entry # download the html page of the entry
@ -556,14 +557,14 @@ class fdb_spider(object):
if 'javascript' not in entry_link and '.pdf' not in entry_link: if 'javascript' not in entry_link and '.pdf' not in entry_link:
print('blabuuuuuba')
#print('oi') #print('oi')
try: try:
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
url = entry_link url = entry_link
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'}) req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
response = urllib.request.urlopen(req) response = urllib.request.urlopen(req)
#print('response from first one', response)
print('response from first one', response)
except Exception as e: except Exception as e:
print('cookie giving then downloading did not work, original error is:', e) print('cookie giving then downloading did not work, original error is:', e)
try: try:
@ -604,7 +605,7 @@ class fdb_spider(object):
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
wget_wrote = False
if web_content == 'NONE': if web_content == 'NONE':
print('other downloading approaches did not work, trying requests') print('other downloading approaches did not work, trying requests')
@ -623,14 +624,16 @@ class fdb_spider(object):
try: try:
os.makedirs(os.path.dirname(file_name), exist_ok=True) os.makedirs(os.path.dirname(file_name), exist_ok=True)
oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name]) oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
wget_wrote = True
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
print('wget downloading did not work.. saving NONE to file now') print('wget downloading did not work.. saving NONE to file now')
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
f.close
if wget_wrote == False:
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
f.close
# save the entry_domain, implemented first for further downloads in javascript links # save the entry_domain, implemented first for further downloads in javascript links
f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")

+ 0
- 2351
spiders/pages/foerderinfo.bund.de1entryList.html
File diff suppressed because it is too large
View File


+ 0
- 2271
spiders/pages/foerderinfo.bund.de2entryList.html
File diff suppressed because it is too large
View File


+ 0
- 2179
spiders/pages/foerderinfo.bund.de3entryList.html
File diff suppressed because it is too large
View File


+ 0
- 2186
spiders/pages/foerderinfo.bund.de4entryList.html
File diff suppressed because it is too large
View File


+ 0
- 2185
spiders/pages/foerderinfo.bund.de5entryList.html
File diff suppressed because it is too large
View File


+ 0
- 2178
spiders/pages/foerderinfo.bund.de6entryList.html
File diff suppressed because it is too large
View File


+ 0
- 2173
spiders/pages/foerderinfo.bund.de7entryList.html
File diff suppressed because it is too large
View File


+ 0
- 1908
spiders/pages/foerderinfo.bund.de8entryList.html
File diff suppressed because it is too large
View File


Loading…
Cancel
Save