diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..6538b58 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,37 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'pdf2doi' +copyright = '2024, Michele Cotrufo' +author = 'Michele Cotrufo' +release = '1.6' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx_autodoc_typehints', +] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +# html_theme = 'alabaster' +html_theme = 
'sphinx_rtd_theme' +html_static_path = ['_static'] + +import sys, os +sys.path.insert(0, os.path.abspath('..')) diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..4090bc9 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,22 @@ +.. pdf2doi documentation master file, created by + sphinx-quickstart on Sat Jul 13 15:27:07 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pdf2doi's documentation! +=================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + source/modules + + + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` +.. * :ref:`modindex` +.. * :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..954237b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..ba87db8 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +pdf2doi +======= + +.. 
toctree:: + :maxdepth: 4 + + pdf2doi diff --git a/docs/source/pdf2doi.rst b/docs/source/pdf2doi.rst new file mode 100644 index 0000000..c1df59e --- /dev/null +++ b/docs/source/pdf2doi.rst @@ -0,0 +1,53 @@ +pdf2doi package +=============== + +Submodules +---------- + +pdf2doi.config module +--------------------- + +.. automodule:: pdf2doi.config + :members: + :undoc-members: + :show-inheritance: + +pdf2doi.find\_title\_via\_pymupdf module +---------------------------------------- + +.. automodule:: pdf2doi.find_title_via_pymupdf + :members: + :undoc-members: + :show-inheritance: + +pdf2doi.finders module +---------------------- + +.. automodule:: pdf2doi.finders + :members: + :undoc-members: + :show-inheritance: + +pdf2doi.main module +------------------- + +.. automodule:: pdf2doi.main + :members: + :undoc-members: + :show-inheritance: + +pdf2doi.patterns module +----------------------- + +.. automodule:: pdf2doi.patterns + :members: + :undoc-members: + :show-inheritance: + +pdf2doi.utils\_registry module +------------------------------ + +.. automodule:: pdf2doi.utils_registry + :members: + :undoc-members: + :show-inheritance: diff --git a/pdf2doi/find_title_via_pymupdf.py b/pdf2doi/find_title_via_pymupdf.py index 4591aea..93dffc1 100644 --- a/pdf2doi/find_title_via_pymupdf.py +++ b/pdf2doi/find_title_via_pymupdf.py @@ -34,6 +34,7 @@ def fonts(doc, granularity=False): def font_tags(font_counts, styles): """Returns dictionary with font sizes as keys and tags as value. + :param font_counts: (font_size, count) for all fonts occuring in document :type font_counts: list :param styles: all styles found in the document @@ -67,6 +68,7 @@ def font_tags(font_counts, styles): def headers_para(doc, size_tag): """Scrapes headers & paragraphs from PDF and return texts with element tags. 
+ :param doc: PDF document to iterate through :type doc: :param size_tag: textual element tags for each size diff --git a/pdf2doi/main.py b/pdf2doi/main.py index c6390ba..9666576 100644 --- a/pdf2doi/main.py +++ b/pdf2doi/main.py @@ -10,13 +10,16 @@ # import pyperclip def pdf2doi(target): - ''' This is the main routine of the library. When the library is used as a command-line tool (via the entry-point "pdf2doi") the input arguments + r''' This is the main routine of the library. When the library is used as a command-line tool (via the entry-point "pdf2doi") the input arguments are collected, validated and sent to this function (see the function main() below). The function tries to extract the DOI (or other identifiers) of the publication in the pdf files whose path is specified in the input variable target. If target contains the valid path of a folder, the function tries to extract the DOI/identifer of all pdf files in the folder. It returns a dictionary (or a list of dictionaries) containing info(s) about the file(s) examined, or None if an error occurred. Example: + + .. code-block:: python + import pdf2doi path = r"Path\to\folder" result = pdf2doi.pdf2doi(path) @@ -35,12 +38,12 @@ def pdf2doi(target): The output is a single dictionary if target is a file, or a list of dictionaries if target is a directory, each element of the list describing one file. Each dictionary has the following keys - result['identifier'] = DOI or other identifier (or None if nothing is found) - result['identifier_type'] = string specifying the type of identifier (e.g. 'doi' or 'arxiv') - result['validation_info'] = Additional info on the paper. If config.get('webvalidation') = True, then result['validation_info'] - will typically contain raw bibtex data for this paper. 
Otherwise it will just contain True - result['path'] = path of the pdf file - result['method'] = method used to find the identifier + - result['identifier'] = DOI or other identifier (or None if nothing is found) + - result['identifier_type'] = string specifying the type of identifier (e.g. 'doi' or 'arxiv') + - result['validation_info'] = Additional info on the paper. If config.get('webvalidation') = True, then result['validation_info'] + will typically contain raw bibtex data for this paper. Otherwise it will just contain True + - result['path'] = path of the pdf file + - result['method'] = method used to find the identifier ''' @@ -119,12 +122,12 @@ def pdf2doi_singlefile(file): result, dictionary The output is a single dictionary with the following keys - result['identifier'] = DOI or other identifier (or None if nothing is found) - result['identifier_type'] = string specifying the type of identifier (e.g. 'doi' or 'arxiv') - result['validation_info'] = Additional info on the paper. If config.get('webvalidation') = True, then result['validation_info'] - will typically contain raw bibtex data for this paper. Otherwise it will just contain True - result['path'] = path of the pdf file - result['method'] = method used to find the identifier + - result['identifier'] = DOI or other identifier (or None if nothing is found) + - result['identifier_type'] = string specifying the type of identifier (e.g. 'doi' or 'arxiv') + - result['validation_info'] = Additional info on the paper. If config.get('webvalidation') = True, then result['validation_info'] + will typically contain raw bibtex data for this paper. 
Otherwise it will just contain True + - result['path'] = path of the pdf file + - result['method'] = method used to find the identifier """ @@ -193,8 +196,9 @@ def __find_doi(file: io.IOBase) -> dict: def save_identifiers(filename_identifiers, results, clipboard=False): - ''' Write all identifiers contained in the input list 'results' into a text file with a path specified by filename_identifiers (if filename_identifiers is a - valid string) and/or into the clipboard (if clipboard = True). + ''' + Write all identifiers contained in the input list 'results' into a text file with a path specified by filename_identifiers (if filename_identifiers is a + valid string) and/or into the clipboard (if clipboard = True). Parameters ----------