Bare Metal install failure

I am trying to follow the baremetal install on a single node and it gets stuck on hive2.

I see:

ERROR [2021-06-07 08:52:58,773] se.kth.karamel.backend.dag.DagNode: Failed 'hive2::install on 172.17.0.3' because '172.17.0.3: Command did not complete: mkdir -p /home/hopsworks/.karamel/install ; cd /home/hopsworks/.karamel/install; echo $$ > pid; echo '#!/bin/bash
set -eo pipefail
echo $(date '+%H:%M:%S'): 'hive2__install' >> order
cat > hive2__install.json <<-'END_OF_FILE'
{
  "hopsmonitor": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "alertmanager": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "prometheus": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "node_exporter": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "ndb": {
    "mysqld": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "ndbd": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "mgmd": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "NoOfReplicas": "1",
    "DataMemory": "4096"
  },
  "flink": {
    "yarn": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "historyserver": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "hopslog": {
    "_filebeat-spark": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "_filebeat-jupyter": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "_filebeat-serving": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "_filebeat-beam": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "kagent": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "consul": {
    "master": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "hops": {
    "rm": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "ndb": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "nn": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "nm": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "dn": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "docker_registry": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "jhs": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "tls": {
      "enabled": "false"
    },
    "rmappsecurity": {
      "actor_class": "org.apache.hadoop.yarn.server.resourcemanager.security.DevHopsworksRMAppSecurityActions"
    },
    "yarn": {
      "pcores-vcores-multiplier": "0.66",
      "cgroups_strict_resource_usage": "false",
      "detect-hardware-capabilities": "true",
      "system-reserved-memory-mb": "4000"
    }
  },
  "hops_airflow": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "sqoop": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "kzookeeper": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "epipe": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "elastic": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "opendistro_security": {
      "logstash": {
        "password": "6bbb058e_201",
        "username": "logstash"
      },
      "epipe": {
        "username": "epipe",
        "password": "6bbb058e_201"
      },
      "admin": {
        "username": "admin",
        "password": "6bbb058e_201"
      },
      "audit": {
        "enable_transport": "false",
        "enable_rest": "true"
      },
      "jwt": {
        "exp_ms": "1800000"
      },
      "kibana": {
        "password": "6bbb058e_201",
        "username": "kibana"
      },
      "elastic_exporter": {
        "username": "elasticexporter",
        "password": "6bbb058e_201"
      }
    }
  },
  "hadoop_spark": {
    "historyserver": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "yarn": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "hive2": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "mysql_password": "6bbb058e_203"
  },
  "kkafka": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "tensorflow": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "conda": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "livy": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    }
  },
  "hopsworks": {
    "default": {
      "private_ips": [
        "172.17.0.3"
      ],
      "public_ips": [
        "172.17.0.3"
      ],
      "private_ips_domainIds": {
        "172.17.0.3": "0"
      },
      "hosts": {
        "172.17.0.3": "172.17.0.3"
      }
    },
    "application_certificate_validity_period": "6d",
    "kagent_liveness": {
      "threshold": "40s",
      "enabled": "true"
    },
    "requests_verify": "false",
    "featurestore_online": "true",
    "admin": {
      "password": "6bbb058e_201",
      "user": "adminuser"
    },
    "encryption_password": "6bbb058e_001",
    "master": {
      "password": "6bbb058e_002"
    },
    "https": {
      "port": "443"
    }
  },
  "install": {
    "kubernetes": "false",
    "dir": "/srv/hops",
    "cloud": "none"
  },
  "mysql": {
    "password": "6bbb058e_202"
  },
  "alertmanager": {
    "email": {
      "to": "sre@logicalclocks.com",
      "smtp_host": "mail.hello.com",
      "from": "hopsworks@logicalclocks.com"
    }
  },
  "prometheus": {
    "retention_time": "8h"
  },
  "private_ips": [
    "172.17.0.3"
  ],
  "public_ips": [
    "172.17.0.3"
  ],
  "hosts": {
    "172.17.0.3": "172.17.0.3"
  },
  "run_list": [
    "hive2::install"
  ]
}
END_OF_FILE
echo "%password_hidden%" | sudo -S  chef-solo -c /home/hopsworks/.karamel/install/solo.rb -j /home/hopsworks/.karamel/install/hive2__install.json 2>&1 | tee hive2__install.log
echo 'https://github.com/logicalclocks/hive-chef/tree/2.1/hive2::install' >> succeed_list
' > hive2__install.sh ; chmod +x hive2__install.sh ; ./hive2__install.sh
', DAG is stuck here :(

in the installation logs.
Any ideas?

Can you please post the content of this file?

/home/hopsworks/.karamel/install/hive2__install.log

I had blown that one away and tried again; this time it got stuck on livy, but the error seems to be the same as the one I got for hive2. The livy__install.log is as follows:

[sudo] password for hopsworks: Starting Chef Client, version 14.10.9
[2021-06-08T01:19:38+00:00] WARN: Plugin Network: unable to detect ipaddress
resolving cookbooks for run list: ["livy::install"]
Synchronizing Cookbooks:
  - livy (2.1.0)
  - java (7.0.0)
  - kagent (2.1.0)
  - hops (2.1.0)
  - ndb (2.1.0)
  - hadoop_spark (2.1.0)
  - conda (2.1.0)
  - homebrew (5.0.8)
  - windows (7.0.2)
  - consul (2.1.0)
  - hostsfile (2.4.6)
  - openssl (4.4.0)
  - ntp (2.0.3)
  - sudo (4.0.1)
  - magic_shell (1.0.0)
  - sysctl (1.0.5)
  - cmake (0.3.0)
  - kzookeeper (2.1.0)
  - elastic (2.1.0)
  - ulimit (2.1.0)
  - hive2 (2.1.0)
  - hopsmonitor (2.1.0)
  - chef-sugar (5.1.12)
  - ohai (5.3.0)
  - ulimit2 (0.2.0)
  - elasticsearch (4.0.6)
  - authbind (0.1.10)
  - compat_resource (12.19.1)
  - tensorflow (2.1.0)
  - hops_airflow (2.1.0)
  - apt (7.2.0)
  - yum (6.1.1)
  - ark (6.0.1)
  - build-essential (8.2.1)
  - zip (1.1.0)
  - poise-python (1.7.0)
  - seven_zip (4.2.1)
  - mingw (2.1.1)
  - poise (2.8.2)
  - poise-languages (2.1.2)
  - poise-archive (1.5.0)

Running handlers:
[2021-06-08T01:19:40+00:00] ERROR: Running exception handlers
Running handlers complete
[2021-06-08T01:19:40+00:00] ERROR: Exception handlers complete
Chef Client failed. 0 resources updated in 02 seconds
[2021-06-08T01:19:40+00:00] FATAL: Stacktrace dumped to /tmp/chef-solo/chef-stacktrace.out
[2021-06-08T01:19:40+00:00] FATAL: Please provide the contents of the stacktrace.out file if you file a bug report
[2021-06-08T01:19:40+00:00] FATAL: Chef::Exceptions::CookbookChefVersionMismatch: Cookbook 'ark' version '6.0.1' depends on chef version [">= 15.3"], but the running chef version is 14.10.9

the chef-stacktrace.out

[sudo] password for hopsworks: 
Generated at 2021-06-08 01:19:40 +0000
Chef::Exceptions::CookbookChefVersionMismatch: Cookbook 'ark' version '6.0.1' depends on chef version [">= 15.3"], but the running chef version is 14.10.9
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/cookbook/metadata.rb:613:in `validate_chef_version!'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/cookbook/cookbook_collection.rb:54:in `block in validate!'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/cookbook/cookbook_collection.rb:53:in `each_value'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/cookbook/cookbook_collection.rb:53:in `validate!'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/policy_builder/expand_node_object.rb:85:in `setup_run_context'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/client.rb:515:in `setup_run_context'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/client.rb:281:in `run'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/application.rb:303:in `run_with_graceful_exit_option'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/application.rb:279:in `block in run_chef_client'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/local_mode.rb:44:in `with_server_connectivity'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/application.rb:261:in `run_chef_client'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/application/client.rb:444:in `run_application'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/application.rb:66:in `run'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/lib/chef/application/solo.rb:224:in `run'
/opt/chefdk/embedded/lib/ruby/gems/2.5.0/gems/chef-14.10.9/bin/chef-solo:24:in `<top (required)>'
/usr/bin/chef-solo:306:in `load'
/usr/bin/chef-solo:306:in `<main>'

Looking at the ark cookbook's release notes (ark/CHANGELOG.md at master · sous-chefs/ark · GitHub), I see:

ark Cookbook CHANGELOG

This file is used to list changes made in each version of the ark cookbook.

Unreleased

6.0.1 - 2021-06-01

6.0.0 - 2021-05-22

  • Chef 17 updates: enable unified_mode on all resources
  • Bump required Chef Infra Client to >= 15.3
  • Migrate to using seven_zip_tool resource directly and require seven_zip >= 3.1
  • Various ChefSpec fixes

Guessing the chef infra bump from 6.0.0 broke it?

Guessing you need to bump the Chef version in all of these (Fix conflicting ark dependencies · logicalclocks/karamel-chef@3c285e5 · GitHub) to 15.3 or above.

I had a quick crack at doing this and the new chef requires a EULA:
see: Accepting the Chef License

It looks like some extra hacking is required for that; I'm not sure whether it has any commercial implications, though.

UPDATE

Installing the latest chefdk before triggering the hopsworks installer fixes it, but you need to run:

chef-client --chef-license accept > /dev/null

first to enable that

@Data-drone: I don’t think that bumping the version of chef is going to work, you’ll have more issues down the line.

I believe you are somehow pulling down an old version of the cookbooks. Both the ark dependency issue and this issue (Bare metal install pulling old tensorflow-chef?) were solved in Hopsworks 2.2 (the most recent release) and in the current master.

Can you make sure you are running the most recent version of the script: https://raw.githubusercontent.com/logicalclocks/karamel-chef/2.2/hopsworks-installer.sh - note the 2.2 in the URL.

Ah okay, I was following the GitHub README, which said to use:
bash <(curl -s https://repo.hops.works/installer/latest/hopsworks-installer.sh)

guessing that link hasn’t been updated to 2.2

Yes, it’s not updated - it’s pulling 2.1. Use this one instead: https://raw.githubusercontent.com/logicalclocks/karamel-chef/2.2/hopsworks-installer.sh

I have only used 2.2, and after a purge, when I try to reinstall, I get:

2021-08-16T08:41:43+00:00] FATAL: Chef::Exceptions::CookbookChefVersionMismatch
: Cookbook 'yum' version '7.0.0' depends on chef version [">= 15.3"], but the ru
nning chef version is 14.10.9

I have now also done a completely fresh “bare-metal” deployment on Ubuntu 18.04 - same thing, but with a different recipe:

.karamel/install/hadoop_spark__install.log 
...
2021-08-16T09:21:38+00:00] FATAL: Chef::Exceptions::CookbookChefVersionMismatch
: Cookbook 'yum' version '7.0.0' depends on chef version [">= 15.3"], but the ru
nning chef version is 14.10.9

On-prem seems broken?!

We are looking into the yum cookbook issue

1 Like

A cookbook that we depend on was updated to require Chef 15, while we are running Chef 14, which is what caused your issue. The issue has now been fixed, and we ask that you retry your installation.

1 Like

Works now - many thanks!