main.py and config.yaml are now excluded from version control (added to .gitignore); only example files are provided. README updated accordingly.
This commit is contained in:
parent
4ec9f76080
commit
0808e5a42d
4 changed files with 203 additions and 1 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,3 +1,5 @@
|
||||||
|
spiders/config.yaml
|
||||||
|
main.py
|
||||||
/venv
|
/venv
|
||||||
/spiders/pages/**
|
/spiders/pages/**
|
||||||
/spiders/output/**
|
/spiders/output/**
|
||||||
|
|
14
README.md
14
README.md
|
@ -80,6 +80,20 @@ pip install -r requirements.txt
|
||||||
|
|
||||||
# Usage
|
# Usage
|
||||||
|
|
||||||
|
Use it step by step. First, take care of the HTML pages that contain the lists of links.
|
||||||
|
Then take care of getting the first JSON output from the first layer of HTML
|
||||||
|
pages.
|
||||||
|
|
||||||
|
Copy the two example files to the file names under which they will be
|
||||||
|
input to the spider:
|
||||||
|
|
||||||
|
```
|
||||||
|
cp main.py_example main.py
|
||||||
|
|
||||||
|
cp spiders/config.yaml_example spiders/config.yaml
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
## Configuration File Syntax
|
## Configuration File Syntax
|
||||||
|
|
||||||
The configuration file with working syntax template is
|
The configuration file with working syntax template is
|
||||||
|
|
28
main.py_example
Normal file
28
main.py_example
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
from spiders.fdb_spider import *
|
||||||
|
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
config = "spiders/config.yaml"
|
||||||
|
#list_of_fdbs = eval(sys.argv[1])
|
||||||
|
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||||
|
#list_of_fdbs = ["giz","evergabe-online"]
|
||||||
|
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||||
|
list_of_fdbs = ["ted.europa.eu"]
|
||||||
|
#list_of_fdbs = ["dtvp"]
|
||||||
|
|
||||||
|
|
||||||
|
# doing the crawling of government websites
|
||||||
|
|
||||||
|
spider = fdb_spider(config)
|
||||||
|
|
||||||
|
spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||||
|
|
||||||
|
#spider.find_config_parameter(list_of_fdbs)
|
||||||
|
|
||||||
|
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
#spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
|
#spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
158
spiders/config.yaml_example
Normal file
158
spiders/config.yaml_example
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue