Skip to content

Commit

Permalink
Merge pull request #1501 from seek4science/license-extraction
Browse files Browse the repository at this point in the history
License extraction
  • Loading branch information
fbacall authored Jul 10, 2023
2 parents 995fca3 + 172a047 commit 88c7d87
Show file tree
Hide file tree
Showing 18 changed files with 464 additions and 206 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ gem 'caxlsx_rails', '~> 0.6.2'

gem 'net-ftp'

gem 'licensee'

group :production do
gem 'passenger'
end
Expand Down
19 changes: 17 additions & 2 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,12 @@ GEM
sxp (~> 1.2)
libreconv (0.9.5)
libxml-ruby (2.9.0)
licensee (9.16.0)
dotenv (~> 2.0)
octokit (>= 4.20, < 7.0)
reverse_markdown (>= 1, < 3)
rugged (>= 0.24, < 2.0)
thor (>= 0.19, < 2.0)
link_header (0.0.8)
linkeddata (3.2.0)
json-ld (~> 3.2)
Expand Down Expand Up @@ -484,7 +490,7 @@ GEM
net-protocol
netrc (0.11.0)
nio4r (2.5.9)
nokogiri (1.14.5-x86_64-linux)
nokogiri (1.14.5)
racc (~> 1.4)
nori (1.1.5)
oauth2 (2.0.9)
Expand All @@ -494,6 +500,9 @@ GEM
rack (>= 1.2, < 4)
snaky_hash (~> 2.0)
version_gem (~> 1.1)
octokit (6.1.1)
faraday (>= 1, < 3)
sawyer (~> 0.9)
omniauth (2.1.0)
hashie (>= 3.4.6)
rack (>= 2.2.3)
Expand Down Expand Up @@ -712,6 +721,8 @@ GEM
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
reverse_markdown (2.1.1)
nokogiri
rexml (3.2.5)
rfc-822 (0.4.1)
rmagick (4.2.5)
Expand Down Expand Up @@ -781,6 +792,9 @@ GEM
nokogiri (>= 1.4.0)
nori (~> 1.1.0)
wasabi (~> 2.5.0)
sawyer (0.9.2)
addressable (>= 2.3.5)
faraday (>= 0.17.3, < 3)
sax-machine (1.3.2)
scanf (1.0.0)
seedbank (0.5.0)
Expand Down Expand Up @@ -933,7 +947,7 @@ GEM
rubyzip (~> 2.0.0)

PLATFORMS
x86_64-linux
ruby

DEPENDENCIES
RedCloth (>= 4.3.0)
Expand Down Expand Up @@ -991,6 +1005,7 @@ DEPENDENCIES
json-schema
libreconv
libxml-ruby (~> 2.9.0)
licensee
linkeddata (~> 3.2.0)
listen (~> 3.3)
lograge
Expand Down
8 changes: 4 additions & 4 deletions app/models/git/blob.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def url
git_version.remote_sources[path]
end

def file
@file ||= to_tempfile
def file(fetch_remote: false)
@file ||= to_tempfile(fetch_remote: fetch_remote)
end

def binread
Expand Down Expand Up @@ -134,10 +134,10 @@ def is_text?

private

def to_tempfile
def to_tempfile(fetch_remote: false)
f = Tempfile.new(path)
f.binmode if binary?
f << file_contents(as_text: !binary?)
f << file_contents(as_text: !binary?, fetch_remote: fetch_remote)
f.rewind
f
end
Expand Down
23 changes: 23 additions & 0 deletions lib/licensee/projects/git_version_project.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
module Licensee
  module Projects
    # Licensee project backed by a Git::Version.
    #
    # The stock GitProject class does not allow repositories that are `head_unborn?`. That restriction
    # is not needed here because an explicit revision is supplied, so the constructor below only
    # stores its arguments and never calls `super`.
    class GitVersionProject < ::Licensee::Projects::GitProject
      # @param git_version the Git::Version to inspect
      # @param detect_packages [Boolean] kept in @detect_packages
      # @param detect_readme [Boolean] kept in @detect_readme
      def initialize(git_version, detect_packages: false, detect_readme: false)
        @git_version = git_version
        @detect_readme = detect_readme
        @detect_packages = detect_packages
      end

      # @return the Rugged::Repository for the Git::Version
      def repository
        git_base = @git_version.git_repository.git_base
        git_base.base
      end

      # @return the Rugged::Commit for the Git::Version
      def commit
        @git_version.commit_object
      end
    end
  end
end
32 changes: 32 additions & 0 deletions lib/licensee/projects/ro_crate_project.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module Licensee
  module Projects
    # Licensee project that looks for license files among the entries of an RO-Crate.
    class RoCrateProject < ::Licensee::Projects::Project
      # @param ro_crate the crate to scan; it is asked for `entries`
      # @param args keyword options passed straight through to the parent constructor
      def initialize(ro_crate, **args)
        @ro_crate = ro_crate
        super(**args)
      end

      # Lists the crate's entries as `{ name:, dir: }` hashes, leaving out remote entries and
      # directories. `dir` is '' when the path has no parent segment. The list is memoized.
      def files
        @files ||= @ro_crate.entries.each_with_object([]) do |(path, entry), found|
          next if entry.remote? || entry.directory?

          # Everything before the final '/'-separated segment is the directory part.
          *parents, name = path.split('/')
          found << { name: name, dir: parents.join('/') }
        end
      end

      # Rebuilds the entry path from a hash produced by #files and reads that entry.
      def load_file(file)
        dir = file[:dir]
        path = dir.blank? ? file[:name] : "#{dir}/#{file[:name]}"
        @ro_crate.entries[path].read
      end
    end
  end
end
4 changes: 0 additions & 4 deletions lib/ro_crate/workflow_crate.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,6 @@ def test_suites
((mentions || []) | (about || [])).select { |entity| entity.has_type?('TestSuite') }
end

# Looks up 'README.md' through `dereference` and returns whatever that call yields.
def readme
dereference('README.md')
end

# Returns the value stored under +path+ in `entries`.
# NOTE(review): presumably nil when the path is unknown - depends on what `entries` is; confirm.
def find_entry(path)
entries[path]
end
Expand Down
12 changes: 12 additions & 0 deletions lib/seek/workflow_extractors/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ def self.file_extensions

private

# Returns the SPDX id of the license detected for +licensee_project+, or nil when the project
# is nil, has no license, or the license is invalid.
#
# An InvalidRepository error is swallowed (nil returned) only in production; in every other
# environment it is re-raised.
def extract_license(licensee_project)
  # Touch the License constant first; without this ::Licensee::InvalidLicense cannot be found.
  ::Licensee::License
  licensee_project&.license&.spdx_id
rescue ::Licensee::InvalidLicense
  nil
rescue ::Licensee::Projects::GitProject::InvalidRepository => e
  raise e unless Rails.env.production?

  nil
end

# Extract author from a string or a Hash complying to schema.org's `Person`
def extract_author(obj)
author = {}
Expand Down
16 changes: 10 additions & 6 deletions lib/seek/workflow_extractors/cff.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,20 @@ class CFF
FILENAME = 'CITATION.cff'

def initialize(io)
@io = io.is_a?(String) ? StringIO.new(io) : io
if io.respond_to?(:path)
@path = io.path
else
f = Tempfile.new('cff')
f.binmode
f.write(io.read)
f.rewind
@path = f.path
end
end

def metadata
metadata = {}
f = Tempfile.new('cff')
f.binmode
f.write(@io.read)
f.rewind
cff = ::CFF::File.read(f.path)
cff = ::CFF::File.read(@path)

other_creators = []
cff.authors.each_with_index do |author, i|
Expand Down
88 changes: 19 additions & 69 deletions lib/seek/workflow_extractors/git_repo.rb
Original file line number Diff line number Diff line change
@@ -1,92 +1,42 @@
require 'rest-client'
require 'ro_crate'
require 'licensee'

module Seek
module WorkflowExtractors
class GitRepo < Base
def initialize(git_version, main_workflow_class: nil)
@git_version = git_version
@main_workflow_class = main_workflow_class
end

def can_render_diagram?
@git_version.path_for_key(:diagram).present? || main_workflow_extractor&.can_render_diagram? || abstract_cwl_extractor&.can_render_diagram?
end

def diagram_extension
path = @git_version.path_for_key(:diagram)
return path.split('.').last if path

super
end

def generate_diagram
if @git_version.path_for_key(:diagram).present?
@git_version.file_contents(@git_version.path_for_key(:diagram))
elsif main_workflow_extractor&.can_render_diagram?
main_workflow_extractor.generate_diagram
elsif abstract_cwl_extractor&.can_render_diagram?
abstract_cwl_extractor.generate_diagram
else
nil
end
end

class GitRepo < ROLike
def metadata
# Use CWL description
m = if @git_version.path_for_key(:abstract_cwl).present?
begin
abstract_cwl_extractor.metadata
rescue StandardError => e
Rails.logger.error('Error extracting abstract CWL:')
Rails.logger.error(e)
{ errors: ["Couldn't parse abstract CWL"] }
end
else
begin
main_workflow_extractor.metadata
rescue StandardError => e
Rails.logger.error('Error extracting workflow:')
Rails.logger.error(e)
{ errors: ["Couldn't parse main workflow"] }
end
end

if @git_version.file_exists?('README.md')
m[:description] ||= @git_version.file_contents('README.md').force_encoding('utf-8')
end
m = super

m.reverse_merge!(cff_extractor.metadata) if cff_extractor

m[:source_link_url] ||= @git_version.git_repository&.remote
m[:source_link_url] ||= @obj.git_repository&.remote

m
end

private

def main_workflow_extractor
return @main_workflow_extractor if defined?(@main_workflow_extractor)

workflow_class = @main_workflow_class
extractor_class = workflow_class&.extractor_class || Seek::WorkflowExtractors::Base
main_workflow_path = @git_version.path_for_key(:main_workflow)
@main_workflow_extractor = main_workflow_path ? extractor_class.new(@git_version.file_contents(main_workflow_path, fetch_remote: true)) : nil
def main_workflow_path
@obj.path_for_key(:main_workflow)
end

def abstract_cwl_extractor
return @abstract_cwl_extractor if defined?(@abstract_cwl_extractor)
def abstract_cwl_path
@obj.path_for_key(:abstract_cwl)
end

abstract_cwl_path = @git_version.path_for_key(:abstract_cwl)
@abstract_cwl_extractor = abstract_cwl_path ? Seek::WorkflowExtractors::CWL.new(@git_version.file_contents(abstract_cwl_path, fetch_remote: true)) : nil
def diagram_path
@obj.path_for_key(:diagram)
end

def cff_extractor
return @cff_extractor if defined?(@cff_extractor)
def file_exists?(path)
@obj.file_exists?(path)
end

cff = @git_version.get_blob(Seek::WorkflowExtractors::CFF::FILENAME)
def file(path)
@obj.get_blob(path)&.file(fetch_remote: true)
end

@cff_extractor = cff ? Seek::WorkflowExtractors::CFF.new(cff) : nil
def licensee_project
@licensee_project ||= ::Licensee::Projects::GitVersionProject.new(@obj)
end
end
end
Expand Down
Loading

0 comments on commit 88c7d87

Please sign in to comment.